From 85a27aced11843410e674ed9934fe8a1a0ac9c4b Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 15 Apr 2024 14:05:38 +0200 Subject: [PATCH 01/58] hopsworks_udf first version --- python/hsfs/hopsworks_udf.py | 264 +++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 python/hsfs/hopsworks_udf.py diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py new file mode 100644 index 0000000000..5be41a2ed5 --- /dev/null +++ b/python/hsfs/hopsworks_udf.py @@ -0,0 +1,264 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ast +import inspect +import warnings +from typing import Callable, List, Union + +from hsfs import engine +from hsfs.client.exceptions import FeatureStoreException + + +def hopsworks_udf(return_type: Union[List[type], type]): + def wrapper(func: Callable): + udf = HopsworksUdf(func=func, return_type=return_type) + return udf + + return wrapper + + +class HopsworksUdf: + """ + Metadata class to store information about UDF + """ + + PYTHON_SPARK_TYPE_MAPPING = { + str: "string", + int: "int", + float: "float", + # "timestamp": TimestampType(), + bool: "boolean", + # "date": DateType(), + # "binary": BinaryType(), + } + + def __init__( + self, func: Callable, return_type: Union[List[type], type], name: str = None + ): + self.udf_function: Callable = func + if name is None: + self.function_name: str = func.__name__ + else: + self.function_name: str = name + self.return_type: Union[List[type], type] = return_type + self.function_source: str = self._remove_argument( + HopsworksUdf._extract_source_code(self.udf_function), "statistics" + ) + # TODO : Add a getter functions + self.transformation_features: List[str] = ( + HopsworksUdf._extract_function_arguments(self.function_source) + ) + HopsworksUdf.validate_arguments(self.return_type) + + def get_transformation_features(self): + return self.transformation_features + + @staticmethod + def validate_arguments(return_type): + if isinstance(return_type, list): + for python_type in return_type: + if not isinstance(python_type, type): + raise FeatureStoreException( + f'Return types provided must be a python type or a list of python types. "{python_type}" is not python type' + ) + else: + if not isinstance(return_type, type): + raise FeatureStoreException( + f'Return types provided must be a python type or a list of python types. 
"{return_type}" is not python type or a list' + ) + + @staticmethod + def _get_module_imports(path): + imports = [] + with open(path) as fh: + root = ast.parse(fh.read(), path) + + for node in ast.iter_child_nodes(root): + if isinstance(node, ast.Import): + imported_module = False + elif isinstance(node, ast.ImportFrom): + imported_module = node.module + else: + continue + + for n in node.names: + if imported_module: + import_line = "from " + imported_module + " import " + n.name + elif n.asname: + import_line = "import " + n.name + " as " + n.asname + else: + import_line = "import " + n.name + imports.append(import_line) + return imports + + @staticmethod + def _get_module_path(module_name): + def _get_module_path(module): + return module.__file__ + + module_path = {} + exec( + f'import {module_name}\nmodule_path["path"] = _get_module_path({module_name})' + ) + return module_path["path"] + + @staticmethod + def _extract_source_code(udf_function): + if not callable(udf_function): + # TODO : Think about a better text for the raised error + raise ValueError("transformation function must be callable") + + try: + module_imports = HopsworksUdf._get_module_imports( + HopsworksUdf._get_module_path(udf_function.__module__) + ) + except Exception: + module_imports = "" + # TODO : Check if warning is actually required. + warnings.warn( + "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", + stacklevel=2, + ) + + function_code = inspect.getsource(udf_function) + source_code = "\n".join(module_imports) + "\n" + function_code + + return source_code + + @staticmethod + def _extract_function_arguments(source_code): + # Get source code of the original function + source_code = source_code.split("\n") + + # Find the line where the function signature is defined + for i, line in enumerate(source_code): + if line.strip().startswith("def "): + signature_line = i + break + + # Parse the function signature to remove the specified argument + signature = source_code[signature_line] + arg_list = signature.split("(")[1].split(")")[0].split(",") + arg_list = [arg.strip() for arg in arg_list] + return arg_list + + def _remove_argument(self, source_code: str, arg_to_remove: str): + """ " + Function to remove statistics arguments from passed udf and type hinting. + Statistics arguments are removed since pandas UDF's do not accept extra arguments. + Statistics parameters are dynamically injected into the function scope. 
+ """ + + # Get source code of the original function + source_code = source_code.split("\n") + + # Find the line where the function signature is defined + for i, line in enumerate(source_code): + if line.strip().startswith("def "): + signature_line = i + break + + # Parse the function signature to remove the specified argument + signature = source_code[signature_line] + arg_list = signature.split("(")[1].split(")")[0].split(",") + arg_list = [ + arg.split(":")[0].strip() + for arg in arg_list + if ( + arg_to_remove not in list(map(str.strip, arg.split(" "))) + and arg_to_remove not in list(map(str.strip, arg.split(":"))) + and arg.strip() != arg_to_remove + ) + ] + + # Reconstruct the function signature + new_signature = ( + signature.split("(")[0] + + "(" + + ", ".join(arg_list) + + ")" + + signature.split(")")[1] + ) + + # Modify the source code to reflect the changes + source_code[signature_line] = new_signature + + # Removing test before function signatre since they are decorators + source_code = source_code[signature_line:] + + # Reconstruct the modified function as a string + modified_source = "\n".join(source_code) + + # Define a new function with the modified source code + return modified_source + + @staticmethod + def get_spark_type(python_type: type): + return HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[python_type] + + def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): + return ", ".join( + [ + f'`{self.function_name}({",".join(self.transformation_features)})_{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' + for i in range(len(return_types)) + ] + ) + + def hopsworksUdf_wrapper(self, **statistics): + # TODO : clean this up + if isinstance(self.return_type, List): + self.function_source = "\t".join(self.function_source.splitlines(True)) + self.code = f"""def renaming_wrapper(*args): + import pandas as pd + {self.function_source} + df = {self.function_name}(*args) + #raise Exception({{f'{{df.columns[i]}}':f'{self.function_name}{",".join(self.transformation_features)}_{{i}}' for i in range(len(df.columns))}}) + df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}({",".join(self.transformation_features)})_{{i}}' for i in range(len(df.columns))}}) + return df""" + else: + self.code = self.function_source + scope = __import__("__main__").__dict__ + scope.update(**statistics) + exec(self.code, scope) + if isinstance(self.transformation_features, List): + return eval("renaming_wrapper", scope) + else: + return eval(self.function_name, scope) + + def __call__(self, *args: List[str]): + for arg in args: + if not isinstance(arg, str): + raise FeatureStoreException( + f'Feature names provided must be string "{arg}" is not string' + ) + + self.transformation_features = list(args) + return self + + def get_udf(self, statistics): + if engine.get_type() in ["hive", "python", "training"]: + return self.hopsworksUdf_wrapper(statistics=statistics) + else: + from pyspark.sql.functions import pandas_udf + + # TODO : Make this proper + return pandas_udf( + f=self.hopsworksUdf_wrapper(statistics=statistics), + returnType=self.create_pandas_udf_return_schema_from_list( + self.return_type + ), + ) From 9e4478bab90b8ee386819520a826f3c6b8da64e2 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 17 Apr 2024 15:26:40 +0200 Subject: [PATCH 02/58] working code for running hopsworks udf without saving in backend using python client --- python/hsfs/constructor/query.py | 2 +- python/hsfs/core/feature_view_engine.py | 19 +- 
.../core/transformation_function_engine.py | 6 + python/hsfs/engine/python.py | 89 ++++++--- python/hsfs/engine/spark.py | 90 +++++---- python/hsfs/feature_store.py | 10 +- python/hsfs/feature_view.py | 34 ++-- python/hsfs/hopsworks_udf.py | 6 +- python/hsfs/training_dataset_feature.py | 20 +- python/hsfs/transformation_function.py | 176 +++--------------- 10 files changed, 186 insertions(+), 266 deletions(-) diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py index e305e8ca5a..5e527b6f13 100644 --- a/python/hsfs/constructor/query.py +++ b/python/hsfs/constructor/query.py @@ -59,7 +59,7 @@ def __init__( fg_mod.ExternalFeatureGroup, fg_mod.SpineGroup, ], - left_features: List[Union[str, "Feature"]], + left_features: List[Union[str, "Feature", Dict]], feature_store_name: Optional[str] = None, feature_store_id: Optional[int] = None, left_feature_group_start_time: Optional[Union[str, int, date, datetime]] = None, diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index dd49fa5e21..770a772af6 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -120,9 +120,9 @@ def save(self, feature_view_obj): ) ) - self._transformation_function_engine.attach_transformation_fn(feature_view_obj) + # TODO : Remove this code portion attaches a transfromation function to a feature. This is not possible with the current implementation + # self._transformation_function_engine.attach_transformation_fn(feature_view_obj) updated_fv = self._feature_view_api.post(feature_view_obj) - self.attach_transformation_function(updated_fv) print( "Feature view created successfully, explore it at \n" + self._get_feature_view_url(updated_fv) @@ -136,25 +136,10 @@ def update(self, feature_view_obj): def get(self, name, version=None): if version: fv = self._feature_view_api.get_by_name_version(name, version) - self.attach_transformation_function(fv) else: fv = self._feature_view_api.get_by_name(name) - for _fv in fv: - self.attach_transformation_function(_fv) return fv - def attach_transformation_function(self, fv: "feature_view.FeatureView"): - fv.transformation_functions = ( - self._transformation_function_engine.get_fv_attached_transformation_fn( - fv.name, fv.version - ) - ) - if fv.transformation_functions: - for feature in fv.schema: - feature.transformation_function = fv.transformation_functions.get( - feature.name, None - ) - def delete(self, name, version=None): if version: return self._feature_view_api.delete_by_name_version(name, version) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 4d1db1df04..beeceb2bd8 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -111,6 +111,7 @@ def get_td_transformation_fn(self, training_dataset): @staticmethod def attach_transformation_fn(training_dataset_obj=None, feature_view_obj=None): + # TODO : Remove transformation function attached to training dataset object and features if training_dataset_obj: target_obj = training_dataset_obj # todo why provide td and fv just to convert to target_obj? 
else: @@ -150,6 +151,7 @@ def is_builtin(self, transformation_fn_instance): def populate_builtin_fn_arguments( feature_name, transformation_function_instance, feature_descriptive_stats ): + # TODO : Make this statistics if transformation_function_instance.name == "min_max_scaler": min_value, max_value = BuiltInTransformationFunction.min_max_scaler_stats( feature_descriptive_stats, feature_name @@ -194,6 +196,7 @@ def populate_builtin_fn_arguments( def populate_builtin_attached_fns( self, attached_transformation_fns, feature_descriptive_stats ): + # TODO : Remove for ft_name in attached_transformation_fns: if self.is_builtin(attached_transformation_fns[ft_name]): # check if its built-in transformation function and populated with statistics arguments @@ -207,6 +210,7 @@ def populate_builtin_attached_fns( @staticmethod def infer_spark_type(output_type): + # TODO : Move to hopsworks_udf if not output_type: return "STRING" # STRING is default type for spark udfs @@ -265,6 +269,8 @@ def compute_transformation_fn_statistics( def populate_builtin_transformation_functions( training_dataset, feature_view_obj, dataset ): + return + # TODO : Remove # check if there any transformation functions that require statistics attached to td features builtin_tffn_label_encoder_features = [ ft_name diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 8e64e6ec95..9e256c322d 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -30,7 +30,7 @@ from datetime import datetime, timezone from io import BytesIO from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING import avro import boto3 @@ -48,7 +48,6 @@ feature, feature_store, feature_view, - transformation_function_attached, util, ) from hsfs import storage_connector as sc @@ -81,6 +80,10 @@ from tqdm.auto import tqdm +if TYPE_CHECKING: + from hsfs.transformation_function import TransformationFunction + + # Disable pyhive INFO logging logging.getLogger("pyhive").setLevel(logging.WARNING) @@ -893,6 +896,7 @@ def get_training_data( df = query_obj.read( read_options=read_options, dataframe_type=dataframe_type ) + # TODO : Add statistics transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( training_dataset_obj, feature_view_obj, df ) @@ -1228,39 +1232,70 @@ def add_file(self, file: Optional[str]) -> Optional[str]: def _apply_transformation_function( self, - transformation_functions: Dict[ - str, transformation_function_attached.TransformationFunctionAttached - ], + transformation_functions: List[TransformationFunction], dataset: Union[pd.DataFrame, pl.DataFrame], ) -> Union[pd.DataFrame, pl.DataFrame]: - for ( - feature_name, - transformation_fn, - ) in transformation_functions.items(): + transformed_features = set() + for transformation_function in transformation_functions: + hopsworks_udf = transformation_function.hopsworks_udf + missing_features = set(hopsworks_udf.transformation_features) - set( + dataset.columns + ) + + # TODO : Add documentation link in exception + if missing_features: + raise FeatureStoreException( + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." 
+ ) + + transformed_features.update( + transformation_function.hopsworks_udf.transformation_features + ) + if isinstance(dataset, pl.DataFrame) or isinstance( dataset, pl.dataframe.frame.DataFrame ): - dataset = dataset.with_columns( - pl.col(feature_name).map_elements( - transformation_fn.transformation_fn - ) - ) + pass else: - dataset[feature_name] = dataset[feature_name].map( - transformation_fn.transformation_fn - ) - # The below functions is not required for Polars since polars does have object types like pandas - if not ( - isinstance(dataset, pl.DataFrame) - or isinstance(dataset, pl.dataframe.frame.DataFrame) - ): - offline_type = Engine.convert_spark_type_to_offline_type( - transformation_fn.output_type - ) - dataset[feature_name] = Engine._cast_column_to_offline_type( - dataset[feature_name], offline_type + dataset = pd.concat( + [ + dataset, + transformation_function.hopsworks_udf.get_udf(statistics=None)( + *( + [ + dataset[feature] + for feature in transformation_function.hopsworks_udf.transformation_features + ] + ) + ), + ], + axis=1, ) + # TODO : Think about what to do in cases where the output is a polars dataframe..... + # if isinstance(dataset, pl.DataFrame) or isinstance( + # dataset, pl.dataframe.frame.DataFrame + # ): + # dataset = dataset.with_columns( + # pl.col(feature_name).map_elements( + # transformation_fn.transformation_fn + # ) + # ) + # else: + + # TODO : Think if below code is actually required + # The below functions is not required for Polars since polars does have object types like pandas + # if not ( + # isinstance(dataset, pl.DataFrame) + # or isinstance(dataset, pl.dataframe.frame.DataFrame) + # ): + # offline_type = Engine.convert_spark_type_to_offline_type( + # transformation_fn.output_type + # ) + # dataset[feature_name] = Engine._cast_column_to_offline_type( + # dataset[feature_name], offline_type + # ) + dataset = dataset.drop(transformed_features, axis=1) return dataset @staticmethod diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index b9f8621cfc..0b69abecdd 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,13 +23,16 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import Any, List, Optional, TypeVar, Union +from typing import Any, List, Optional, TypeVar, Union, TYPE_CHECKING import avro import numpy as np import pandas as pd import tzlocal +if TYPE_CHECKING: + from hsfs.transformation_function import TransformationFunction + # in case importing in %%local from hsfs.core.vector_db_client import VectorDbClient @@ -586,6 +589,7 @@ def write_training_dataset( feature_view_obj=None, to_df=False, ): + print("[SPARK] write_training_dataset") write_options = self.write_options( training_dataset.data_format, user_write_options ) @@ -810,6 +814,7 @@ def _write_training_dataset_single( path, to_df=False, ): + print("[SPARK] _write_training_dataset_single") # apply transformation functions (they are applied separately to each split) feature_dataframe = self._apply_transformation_function( transformation_functions, dataset=feature_dataframe @@ -1162,23 +1167,42 @@ def add_cols_to_delta_table(self, feature_group, new_features): "spark.databricks.delta.schema.autoMerge.enabled", "true" ).save(feature_group.location) - def _apply_transformation_function(self, transformation_functions, dataset): + def _apply_transformation_function( + self, transformation_functions: List[TransformationFunction], dataset + ): # generate transformation function expressions - 
transformed_feature_names = [] - transformation_fn_expressions = [] - for ( - feature_name, - transformation_fn, - ) in transformation_functions.items(): - fn_registration_name = ( - transformation_fn.name - + "_" - + str(transformation_fn.version) - + "_" - + feature_name + print("[SPARK] _apply_transformation_function") + transformed_features = set() + transformations = [] + transformation_features = [] + explode_name = [] + for transformation_function in transformation_functions: + hopsworks_udf = transformation_function.hopsworks_udf + missing_features = set(hopsworks_udf.transformation_features) - set( + dataset.columns + ) + + # TODO : Add documentation link in exception + if missing_features: + raise FeatureStoreException( + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." + ) + + transformed_features.update( + transformation_function.hopsworks_udf.transformation_features ) - def timezone_decorator(func, trans_fn=transformation_fn): + # TODO : Add statistics + pandas_udf = hopsworks_udf.get_udf(None) + transformations.append(pandas_udf) + transformation_features.append(hopsworks_udf.transformation_features) + + if isinstance(hopsworks_udf.return_type, List): + explode_name.append( + f'{pandas_udf.__name__}({", ".join(hopsworks_udf.transformation_features)}).*' + ) + + def timezone_decorator(func, trans_fn=hopsworks_udf): if trans_fn.output_type != "TIMESTAMP": return func @@ -1200,29 +1224,27 @@ def decorated_func(x): return decorated_func - self._spark_session.udf.register( - fn_registration_name, - timezone_decorator(transformation_fn.transformation_fn), - transformation_fn.output_type, - ) - transformation_fn_expressions.append( - "{fn_name:}({name:}) AS {name:}".format( - fn_name=fn_registration_name, name=feature_name - ) - ) - transformed_feature_names.append(feature_name) + # TODO : Timezone aware check see if I need to do also. 
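The commented-out `udf.register` call below is the remnant of the old `selectExpr`-based path; its replacement applies each pandas UDF directly as a column expression and, for multi-output UDFs, expands the struct result with `.*` via the `explode_name` entries built above. A minimal, self-contained sketch of that mechanism, with toy column names and a local Spark session assumed:

    import pandas as pd
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import pandas_udf

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["a", "b"])

    # one pandas UDF producing two output columns as a struct
    @pandas_udf("`f_0` double, `f_1` double")
    def add_and_diff(a: pd.Series, b: pd.Series) -> pd.DataFrame:
        return pd.DataFrame({"f_0": a + b, "f_1": a - b})

    untransformed = [c for c in df.columns if c not in {"a", "b"}]
    result = (
        df.select(*untransformed, add_and_diff("a", "b").alias("out"))
          .select(*untransformed, "out.*")  # the `.*` expansion mirrors explode_name
    )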
+ # self._spark_session.udf.register( + # fn_registration_name, + # timezone_decorator(transformation_fn.transformation_fn), + # transformation_fn.output_type, + # ) # generate non transformation expressions - no_transformation_expr = [ - "{name:} AS {name:}".format(name=col_name) - for col_name in dataset.columns - if col_name not in transformed_feature_names - ] # generate entire expression and execute it - transformation_fn_expressions.extend(no_transformation_expr) - transformed_dataset = dataset.selectExpr(*transformation_fn_expressions) - return transformed_dataset.select(*dataset.columns) + + untransformed_columns = set(dataset.columns) - transformed_features + transformed_dataset = dataset.select( + *untransformed_columns, + *[ + fun(*feature) + for fun, feature in zip(transformations, transformation_features) + ], + ).select(*untransformed_columns, *explode_name) + + return transformed_dataset def _setup_gcp_hadoop_conf(self, storage_connector, path): PROPERTY_ENCRYPTION_KEY = "fs.gs.encryption.key" diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index c8a18dc6c0..24033bf11b 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -18,7 +18,7 @@ import datetime import warnings -from typing import Any, Dict, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union import great_expectations as ge import humps @@ -53,6 +53,10 @@ from hsfs.transformation_function import TransformationFunction +if TYPE_CHECKING: + from hsfs.hopsworks_udf import HopsworksUdf + + @typechecked class FeatureStore: DEFAULT_VERSION = 1 @@ -1464,7 +1468,9 @@ def create_feature_view( labels: Optional[List[str]] = None, inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, - transformation_functions: Optional[Dict[str, TransformationFunction]] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> feature_view.FeatureView: """Create a feature view metadata object and saved it to hopsworks. 
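For context before the feature_view.py changes: from user code, the new list-based argument is intended to be called roughly as below. `fs` (a connected feature-store handle) and `query` are assumed to exist already, and the column name is invented for illustration:

    from hsfs.hopsworks_udf import hopsworks_udf

    @hopsworks_udf(return_type=float)
    def plus_one(amount):
        return amount + 1

    fv = fs.create_feature_view(
        name="demo_view",
        version=1,
        query=query,
        # a bare HopsworksUdf is accepted too; FeatureView.__init__ below
        # wraps it into a TransformationFunction with version=1
        transformation_functions=[plus_one("amount")],
    )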
diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 4f6a9dbb8e..82e45e4b2c 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -15,7 +15,6 @@ # from __future__ import annotations -import copy import json import logging import warnings @@ -36,7 +35,6 @@ util, ) from hsfs import serving_key as skm -from hsfs import transformation_function as tfm from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import filter, query from hsfs.constructor.filter import Filter, Logic @@ -59,6 +57,11 @@ from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig from hsfs.training_dataset_split import TrainingDatasetSplit +from hsfs.transformation_function import TransformationFunction + + +if TYPE_CHECKING: + from hsfs.hopsworks_udf import HopsworksUdf _logger = logging.getLogger(__name__) @@ -98,7 +101,7 @@ def __init__( inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, transformation_functions: Optional[ - Dict[str, tfm.TransformationFunction] + List[Union[TransformationFunction, HopsworksUdf]] ] = None, featurestore_name: Optional[str] = None, serving_keys: Optional[List[skm.ServingKey]] = None, @@ -119,14 +122,21 @@ def __init__( self._training_helper_columns = ( training_helper_columns if training_helper_columns else [] ) - self._transformation_functions = ( - { - ft_name: copy.deepcopy(transformation_functions[ft_name]) - for ft_name in transformation_functions - } - if transformation_functions - else {} + + # TODO : Clean this up + if transformation_functions: + for i, transformation_function in enumerate(transformation_functions): + if not isinstance(transformation_function, TransformationFunction): + transformation_functions[i] = TransformationFunction( + self.featurestore_id, + hopsworks_udf=transformation_function, + version=1, + ) + + self._transformation_functions: List[TransformationFunction] = ( + transformation_functions ) + self._features = [] self._feature_view_engine: feature_view_engine.FeatureViewEngine = ( feature_view_engine.FeatureViewEngine(featurestore_id) @@ -3578,14 +3588,14 @@ def query(self, query_obj: "query.Query") -> None: @property def transformation_functions( self, - ) -> Dict[str, tfm.TransformationFunction]: + ) -> List[TransformationFunction]: """Get transformation functions.""" return self._transformation_functions @transformation_functions.setter def transformation_functions( self, - transformation_functions: Dict[str, tfm.TransformationFunction], + transformation_functions: List[TransformationFunction], ) -> None: self._transformation_functions = transformation_functions diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 5be41a2ed5..b2a8bae274 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -213,7 +213,7 @@ def get_spark_type(python_type: type): def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): return ", ".join( [ - f'`{self.function_name}({",".join(self.transformation_features)})_{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' + f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' for i in range(len(return_types)) ] ) @@ -226,8 +226,7 @@ def hopsworksUdf_wrapper(self, **statistics): import pandas as pd {self.function_source} df = {self.function_name}(*args) - #raise 
Exception({{f'{{df.columns[i]}}':f'{self.function_name}{",".join(self.transformation_features)}_{{i}}' for i in range(len(df.columns))}}) - df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}({",".join(self.transformation_features)})_{{i}}' for i in range(len(df.columns))}}) + df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' for i in range(len(df.columns))}}) return df""" else: self.code = self.function_source @@ -240,6 +239,7 @@ def hopsworksUdf_wrapper(self, **statistics): return eval(self.function_name, scope) def __call__(self, *args: List[str]): + # TODO : Raise an execption if the number of features are incorrect. for arg in args: if not isinstance(arg, str): raise FeatureStoreException( diff --git a/python/hsfs/training_dataset_feature.py b/python/hsfs/training_dataset_feature.py index 6c3a04ea3d..c444e833c7 100644 --- a/python/hsfs/training_dataset_feature.py +++ b/python/hsfs/training_dataset_feature.py @@ -18,10 +18,8 @@ import humps from hsfs import feature as feature_mod from hsfs import feature_group as feature_group_mod -from hsfs import transformation_function as tf_mod from hsfs import util - class TrainingDatasetFeature: def __init__( self, @@ -33,7 +31,6 @@ def __init__( label=False, inference_helper_column=False, training_helper_column=False, - transformation_function=None, **kwargs, ): self._name = util.autofix_feature_name(name) @@ -48,11 +45,6 @@ def __init__( self._label = label self._inference_helper_column = inference_helper_column self._training_helper_column = training_helper_column - self._transformation_function = ( - tf_mod.TransformationFunction.from_response_json(transformation_function) - if isinstance(transformation_function, dict) - else transformation_function - ) def to_dict(self): return { @@ -62,7 +54,6 @@ def to_dict(self): "label": self._label, "inferenceHelperColumn": self._inference_helper_column, "trainingHelperColumn": self._training_helper_column, - "transformationFunction": self._transformation_function, "featureGroupFeatureName": self._feature_group_feature_name, "featuregroup": self._feature_group, } @@ -127,15 +118,6 @@ def training_helper_column(self): def training_helper_column(self, training_helper_column): self._training_helper_column = training_helper_column - @property - def transformation_function(self): - """Set transformation functions.""" - return self._transformation_function - - @transformation_function.setter - def transformation_function(self, transformation_function): - self._transformation_function = transformation_function - @property def feature_group(self): return self._feature_group @@ -145,4 +127,4 @@ def feature_group_feature_name(self): return self._feature_group_feature_name def __repr__(self): - return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._transformation_function}, {self._feature_group_feature_name}, {self._feature_group.id!r})" + return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._feature_group_feature_name}, {self._feature_group.id!r})" diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index ffd88fd502..fee9f1f41e 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -14,26 +14,28 @@ # from __future__ import annotations -import ast -import inspect import json +from typing import TYPE_CHECKING, Optional import humps from hsfs 
import util from hsfs.core import transformation_function_engine +from hsfs.decorators import typechecked +if TYPE_CHECKING: + from hsfs.hopsworks_udf import HopsworksUdf + + +@typechecked class TransformationFunction: def __init__( self, - featurestore_id, - transformation_fn=None, - version=None, - name=None, - source_code_content=None, - builtin_source_code=None, - output_type=None, - id=None, + featurestore_id: int, + hopsworks_udf: HopsworksUdf, + version: Optional[int] = None, + id: Optional[int] = None, + # TODO : Check if the below are actually needed type=None, items=None, count=None, @@ -43,45 +45,16 @@ def __init__( self._id = id self._featurestore_id = featurestore_id self._version = version - self._name = name - self._transformation_fn = transformation_fn - self._source_code_content = source_code_content self._transformation_function_engine = ( transformation_function_engine.TransformationFunctionEngine( self._featurestore_id ) ) - - # set up depending on user initialized - if self._transformation_fn is not None: - # type -> user init coming from user - self._transformer_code = None - self._extract_source_code() - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - elif builtin_source_code is not None: - # user triggered to register built-in transformation function - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - self._source_code_content = json.dumps( - { - "module_imports": "", - "transformer_code": builtin_source_code, - } - ) - else: - # load backend response - # load original source code - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - self._load_source_code(self._source_code_content) - - self._feature_group_feature_name = None - self._feature_group_id = None + self._hopsworks_udf = hopsworks_udf + self._name = hopsworks_udf.function_name + self._feature_group_feature_name: Optional[str] = None + self._feature_group_id: Optional[int] = None def save(self): """Persist transformation function in backend. 
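With the rewritten constructor, a transformation function is now a thin, versioned wrapper around a HopsworksUdf, so persisting one reduces to something like the sketch below (`fs` is an assumed feature-store handle and `plus_one` the UDF from the earlier example):

    from hsfs.transformation_function import TransformationFunction

    tf = TransformationFunction(
        featurestore_id=fs.id,  # assumes the handle exposes its numeric id
        hopsworks_udf=plus_one,
        version=1,
    )
    tf.save()  # still delegates to TransformationFunctionEngine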
@@ -132,77 +105,6 @@ def plus_one(value): """ self._transformation_function_engine.delete(self) - def _extract_source_code(self): - if not callable(self._transformation_fn): - raise ValueError("transformer must be callable") - - self._name = self._transformation_fn.__name__ - - transformer_code = inspect.getsource(self._transformation_fn) - - module_imports = self._get_module_imports( - self._get_module_path(self._transformation_fn.__module__) - ) - - self._transformer_code = "\n".join(module_imports) + "\n" + transformer_code - - # initialise source code dict - # add all imports from module - # add original source code that will be used during offline transformations - self._source_code_content = json.dumps( - { - "module_imports": "\n".join(module_imports), - "transformer_code": transformer_code, - } - ) - - @staticmethod - def _get_module_path(module_name): - def _get_module_path(module): - return module.__file__ - - module_path = {} - exec( - """import %s\nmodule_path["path"] = _get_module_path(%s)""" - % (module_name, module_name) - ) - return module_path["path"] - - @staticmethod - def _get_module_imports(path): - imports = [] - with open(path) as fh: - root = ast.parse(fh.read(), path) - - for node in ast.iter_child_nodes(root): - if isinstance(node, ast.Import): - imported_module = False - elif isinstance(node, ast.ImportFrom): - imported_module = node.module - else: - continue - - for n in node.names: - if imported_module: - import_line = "from " + imported_module + " import " + n.name - elif n.asname: - import_line = "import " + n.name + " as " + n.asname - else: - import_line = "import " + n.name - imports.append(import_line) - return imports - - def _load_source_code(self, source_code_content): - source_code_content = json.loads(source_code_content) - module_imports = source_code_content["module_imports"] - transformer_code = source_code_content["transformer_code"] - self._transformer_code = module_imports + "\n" * 2 + transformer_code - - scope = __import__("__main__").__dict__ - exec(self._transformer_code, scope) - self._transformation_fn = eval(self._name, scope) - self._transformation_fn._code = self._transformer_code - @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) @@ -232,58 +134,30 @@ def to_dict(self): } @property - def id(self): - """Training dataset id.""" + def id(self) -> id: + """Transformation function id.""" return self._id @id.setter - def id(self, id): + def id(self, id: int): self._id = id @property - def name(self): + def name(self) -> str: return self._name @property - def version(self): + def version(self) -> int: return self._version @property - def transformer_code(self): - return self._transformer_code - - @property - def transformation_fn(self): - return self._transformation_fn - - @property - def source_code_content(self): - return self._source_code_content - - @property - def output_type(self): - return self._output_type + def hopsworks_udf(self) -> HopsworksUdf: + return self._hopsworks_udf @name.setter - def name(self, name): + def name(self, name: str): self._name = name @version.setter - def version(self, version): + def version(self, version: int): self._version = version - - @transformer_code.setter - def transformer_code(self, transformer_code): - self._transformer_code = transformer_code - - @transformation_fn.setter - def transformation_fn(self, transformation_fn): - self._transformation_fn = transformation_fn - - @source_code_content.setter - def source_code_content(self, 
source_code_content): - self._source_code_content = source_code_content - - @output_type.setter - def output_type(self, output_type): - self._output_type = output_type From 75441a3d8d6eac03b4c001d2ea2bdc7382ec7902 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 17 Apr 2024 15:43:10 +0200 Subject: [PATCH 03/58] removing debugging logs --- python/hsfs/engine/spark.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 0b69abecdd..3e7e40a54a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -589,7 +589,6 @@ def write_training_dataset( feature_view_obj=None, to_df=False, ): - print("[SPARK] write_training_dataset") write_options = self.write_options( training_dataset.data_format, user_write_options ) @@ -814,7 +813,6 @@ def _write_training_dataset_single( path, to_df=False, ): - print("[SPARK] _write_training_dataset_single") # apply transformation functions (they are applied separately to each split) feature_dataframe = self._apply_transformation_function( transformation_functions, dataset=feature_dataframe @@ -1171,7 +1169,6 @@ def _apply_transformation_function( self, transformation_functions: List[TransformationFunction], dataset ): # generate transformation function expressions - print("[SPARK] _apply_transformation_function") transformed_features = set() transformations = [] transformation_features = [] From 7af03f2cbe3f876713312bdd7e109102214ecc68 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 18 Apr 2024 15:12:42 +0200 Subject: [PATCH 04/58] statistics working with python client --- python/hsfs/core/feature_view_engine.py | 2 +- .../core/transformation_function_engine.py | 110 +++------ python/hsfs/engine/python.py | 6 +- python/hsfs/engine/spark.py | 6 +- python/hsfs/feature_view.py | 8 + python/hsfs/hopsworks_udf.py | 211 +++++++++++++++--- python/hsfs/transformation_function.py | 16 +- 7 files changed, 229 insertions(+), 130 deletions(-) diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 770a772af6..e954701d8e 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -121,7 +121,7 @@ def save(self, feature_view_obj): ) # TODO : Remove this code portion attaches a transfromation function to a feature. This is not possible with the current implementation - # self._transformation_function_engine.attach_transformation_fn(feature_view_obj) + updated_fv = self._feature_view_api.post(feature_view_obj) print( "Feature view created successfully, explore it at \n" diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index beeceb2bd8..492567e6ec 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -109,38 +109,6 @@ def get_td_transformation_fn(self, training_dataset): ) return transformation_fn_dict - @staticmethod - def attach_transformation_fn(training_dataset_obj=None, feature_view_obj=None): - # TODO : Remove transformation function attached to training dataset object and features - if training_dataset_obj: - target_obj = training_dataset_obj # todo why provide td and fv just to convert to target_obj? 
- else: - target_obj = feature_view_obj - - if target_obj._transformation_functions: - for ( - feature_name, - transformation_fn, - ) in target_obj._transformation_functions.items(): - if feature_name in target_obj.labels: - raise ValueError( - "Online transformations for training dataset labels are not supported." - ) - - feature, prefix, featuregroup = target_obj.query._get_feature_by_name( - feature_name - ) - target_obj._features.append( - training_dataset_feature.TrainingDatasetFeature( - name=feature_name, - feature_group_feature_name=feature.name, - featuregroup=featuregroup, - type=transformation_fn.output_type, - label=False, - transformation_function=transformation_fn, - ) - ) - def is_builtin(self, transformation_fn_instance): return ( transformation_fn_instance.name in self.BUILTIN_FN_NAMES @@ -249,6 +217,7 @@ def infer_spark_type(output_type): else: raise TypeError("Not supported type %s." % output_type) + # TODO : Think about what to do with label encoder features. @staticmethod def compute_transformation_fn_statistics( training_dataset_obj, @@ -266,58 +235,35 @@ def compute_transformation_fn_statistics( ) @staticmethod - def populate_builtin_transformation_functions( - training_dataset, feature_view_obj, dataset - ): - return - # TODO : Remove - # check if there any transformation functions that require statistics attached to td features - builtin_tffn_label_encoder_features = [ - ft_name - for ft_name in training_dataset.transformation_functions - if training_dataset._transformation_function_engine.is_builtin( - training_dataset.transformation_functions[ft_name] - ) - and training_dataset.transformation_functions[ft_name].name - == "label_encoder" - ] - builtin_tffn_features = [ - ft_name - for ft_name in training_dataset.transformation_functions - if training_dataset._transformation_function_engine.is_builtin( - training_dataset.transformation_functions[ft_name] + def add_feature_statistics(training_dataset, feature_view_obj, dataset): + # TODO : Optimize this code portion check which i better computing all transformation feature statistics together or one by one. 
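The loop that follows collects the union of features whose UDFs request statistics. A UDF opts in through the naming convention implemented in hopsworks_udf.py later in this patch: a parameter named `statistics_<arg>` requests descriptive statistics for `<arg>` and is stripped from the generated pandas UDF signature. A sketch — the `min`/`max` attributes on the injected statistics object are assumed here:

    @hopsworks_udf(return_type=float)
    def min_max_scaler(value, statistics_value):
        # statistics_value is injected from train-split statistics at runtime
        return (value - statistics_value.min) / (
            statistics_value.max - statistics_value.min
        )

    min_max_scaler("amount").statistics_features  # -> ["amount"]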
+ statistics_features = set() + for transformation_function in feature_view_obj.transformation_functions: + statistics_features.update( + transformation_function.hopsworks_udf.statistics_features ) - and training_dataset.transformation_functions[ft_name].name - != "label_encoder" - ] - if builtin_tffn_features or builtin_tffn_label_encoder_features: - if training_dataset.splits: - # compute statistics before transformations are applied - stats = ( - TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - builtin_tffn_features, - builtin_tffn_label_encoder_features, - dataset.get(training_dataset.train_split), - feature_view_obj, - ) - ) - else: - # compute statistics before transformations are applied - stats = ( - TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - builtin_tffn_features, - builtin_tffn_label_encoder_features, - dataset, - feature_view_obj, - ) - ) - # Populate builtin transformations (if any) with respective arguments - return training_dataset._transformation_function_engine.populate_builtin_attached_fns( - training_dataset.transformation_functions, - stats.feature_descriptive_statistics, + if training_dataset.splits: + # compute statistics before transformations are applied + stats = TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset.get(training_dataset.train_split), + feature_view_obj, + ) + else: + # compute statistics before transformations are applied + stats = TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset, + feature_view_obj, + ) + for transformation_function in feature_view_obj.transformation_functions: + transformation_function.hopsworks_udf.transformation_statistics = ( + stats.feature_descriptive_statistics ) def get_ready_to_use_transformation_fns( diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 9e256c322d..9754b96997 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -897,7 +897,7 @@ def get_training_data( read_options=read_options, dataframe_type=dataframe_type ) # TODO : Add statistics - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset_obj, feature_view_obj, df ) return self._apply_transformation_function( @@ -972,7 +972,7 @@ def _prepare_transform_split_df( # apply transformations # 1st parametrise transformation functions with dt split stats - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset_obj, feature_view_obj, result_dfs ) # and the apply them @@ -1260,7 +1260,7 @@ def _apply_transformation_function( dataset = pd.concat( [ dataset, - transformation_function.hopsworks_udf.get_udf(statistics=None)( + transformation_function.hopsworks_udf.get_udf()( *( [ dataset[feature] diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 3e7e40a54a..74c1a833e9 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -603,7 +603,7 @@ def write_training_dataset( else: raise ValueError("Dataset should be a query.") - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + 
transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset, feature_view_obj, dataset ) if training_dataset.coalesce: @@ -629,7 +629,7 @@ def write_training_dataset( split_dataset[key] = split_dataset[key].cache() - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset, feature_view_obj, split_dataset ) return self._write_training_dataset_splits( @@ -1190,7 +1190,7 @@ def _apply_transformation_function( ) # TODO : Add statistics - pandas_udf = hopsworks_udf.get_udf(None) + pandas_udf = hopsworks_udf.get_udf() transformations.append(pandas_udf) transformation_features.append(hopsworks_udf.transformation_features) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 82e45e4b2c..78e2101ed0 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -3412,6 +3412,12 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": description=json_decamelized.get("description", None), featurestore_name=json_decamelized.get("featurestore_name", None), serving_keys=serving_keys, + transformation_functions=[ + TransformationFunction.from_response_json(transformation) + for transformation in json_decamelized.get( + "transformation_functions", [] + ) + ], ) features = json_decamelized.get("features", []) if features: @@ -3444,6 +3450,7 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView": "labels", "inference_helper_columns", "training_helper_columns", + "transformation_functions", "schema", "serving_keys", ]: @@ -3483,6 +3490,7 @@ def to_dict(self) -> Dict[str, Any]: "description": self._description, "query": self._query, "features": self._features, + "transformation_functions": self._transformation_functions, "type": "featureViewDTO", } diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b2a8bae274..5abaccedbe 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -16,11 +16,15 @@ import ast import inspect +import json import warnings -from typing import Callable, List, Union +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union -from hsfs import engine +import humps +from hsfs import engine, util from hsfs.client.exceptions import FeatureStoreException +from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics def hopsworks_udf(return_type: Union[List[type], type]): @@ -31,11 +35,24 @@ def wrapper(func: Callable): return wrapper +@dataclass +class TransformationFeature: + feature_name: str + statistic_argument_name: Optional[str] + + def to_dict(self): + return { + "feature_name": self.feature_name, + "statistic_argument_name": self.statistic_argument_name, + } + + class HopsworksUdf: """ Metadata class to store information about UDF """ + # TODO : Complete this PYTHON_SPARK_TYPE_MAPPING = { str: "string", int: "int", @@ -46,21 +63,40 @@ class HopsworksUdf: # "binary": BinaryType(), } + STRING_PYTHON_TYPES_MAPPING = {"str": str, "int": int, "float": float, "bool": bool} + def __init__( - self, func: Callable, return_type: Union[List[type], type], name: str = None + self, + func: Union[Callable, str], + return_type: Union[List[type], type], + name: str = None, + transformation_features: List[TransformationFeature] = None, ): - self.udf_function: Callable = func if name is None: - self.function_name: str = 
func.__name__ + self._function_name: str = func.__name__ else: - self.function_name: str = name - self.return_type: Union[List[type], type] = return_type - self.function_source: str = self._remove_argument( - HopsworksUdf._extract_source_code(self.udf_function), "statistics" - ) - # TODO : Add a getter functions - self.transformation_features: List[str] = ( - HopsworksUdf._extract_function_arguments(self.function_source) + self._function_name: str = name + + self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = dict() + + self._return_type: Union[List[type], type] = return_type + + if isinstance(func, Callable): + self._function_source: str = HopsworksUdf._extract_source_code(func) + else: + self._function_source: str = func + + if transformation_features: + self._transformation_features: List[TransformationFeature] = ( + transformation_features + ) + else: + self._transformation_features: List[TransformationFeature] = ( + HopsworksUdf._extract_function_arguments(self.function_source) + ) + + self._function_source = self._remove_argument( + self.function_source, "statistics" ) HopsworksUdf.validate_arguments(self.return_type) @@ -118,10 +154,6 @@ def _get_module_path(module): @staticmethod def _extract_source_code(udf_function): - if not callable(udf_function): - # TODO : Think about a better text for the raised error - raise ValueError("transformation function must be callable") - try: module_imports = HopsworksUdf._get_module_imports( HopsworksUdf._get_module_path(udf_function.__module__) @@ -153,8 +185,16 @@ def _extract_function_arguments(source_code): # Parse the function signature to remove the specified argument signature = source_code[signature_line] arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [arg.strip() for arg in arg_list] - return arg_list + + arg_list = [arg.split(":")[0].strip() for arg in arg_list] + + return [ + TransformationFeature( + arg, f"statistics_{arg}" if f"statistics_{arg}" in arg_list else None + ) + for arg in arg_list + if not arg.startswith("statistics") + ] def _remove_argument(self, source_code: str, arg_to_remove: str): """ " @@ -181,6 +221,7 @@ def _remove_argument(self, source_code: str, arg_to_remove: str): if ( arg_to_remove not in list(map(str.strip, arg.split(" "))) and arg_to_remove not in list(map(str.strip, arg.split(":"))) + and arg_to_remove not in list(map(str.strip, arg.split("_"))) and arg.strip() != arg_to_remove ) ] @@ -218,47 +259,149 @@ def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): ] ) - def hopsworksUdf_wrapper(self, **statistics): + def hopsworksUdf_wrapper(self): # TODO : clean this up + function_source = "\t".join(self.function_source.splitlines(True)) if isinstance(self.return_type, List): - self.function_source = "\t".join(self.function_source.splitlines(True)) - self.code = f"""def renaming_wrapper(*args): + code = f"""def renaming_wrapper(*args): import pandas as pd - {self.function_source} + {function_source} df = {self.function_name}(*args) df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' for i in range(len(df.columns))}}) return df""" else: - self.code = self.function_source + code = f"""def renaming_wrapper(*args): + import pandas as pd + {function_source} + df = {self.function_name}(*args) + df = df.rename(f'{self.function_name}<{"-".join(self.transformation_features)}>') + return df""" scope = __import__("__main__").__dict__ - scope.update(**statistics) - exec(self.code, scope) - 
if isinstance(self.transformation_features, List): - return eval("renaming_wrapper", scope) - else: - return eval(self.function_name, scope) + scope.update(self.transformation_statistics) + exec(code, scope) + return eval("renaming_wrapper", scope) def __call__(self, *args: List[str]): # TODO : Raise an execption if the number of features are incorrect. + if len(args) != len(self.transformation_features): + raise FeatureStoreException( + "Number of features provided does not match the number of features provided in the UDF definition" + ) for arg in args: if not isinstance(arg, str): raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) - self.transformation_features = list(args) + self._transformation_features = [ + TransformationFeature( + new_feature_name, transformation_feature.statistic_argument_name + ) + for transformation_feature, new_feature_name in zip( + self._transformation_features, args + ) + ] return self - def get_udf(self, statistics): + def get_udf(self): if engine.get_type() in ["hive", "python", "training"]: - return self.hopsworksUdf_wrapper(statistics=statistics) + return self.hopsworksUdf_wrapper() else: from pyspark.sql.functions import pandas_udf # TODO : Make this proper return pandas_udf( - f=self.hopsworksUdf_wrapper(statistics=statistics), + f=self.hopsworksUdf_wrapper(), returnType=self.create_pandas_udf_return_schema_from_list( self.return_type ), ) + + def to_dict(self): + return { + "func": self.function_source, + "name": self.function_name, + "return_type": [python_type.__name__ for python_type in self.return_type] + if isinstance(self.return_type, List) + else self.return_type.__name__, + "transformation_features": self.transformation_features, + } + + def json(self) -> str: + return json.dumps(self, cls=util.FeatureStoreEncoder) + + @classmethod + def from_response_json( + cls: "HopsworksUdf", json_dict: Dict[str, Any] + ) -> "HopsworksUdf": + json_decamelized = humps.decamelize(json_dict) + function_source_code = json_decamelized["func"] + function_name = json_decamelized["name"] + return_type = json_decamelized["return_type"] + transformation_features = json_decamelized["transformation_features"] + + hopsworks_udf = cls( + func=function_source_code, + return_type=[ + cls.STRING_PYTHON_TYPES_MAPPING[python_type] + for python_type in return_type + ] + if isinstance(return_type, List) + else cls.STRING_PYTHON_TYPES_MAPPING[return_type], + name=function_name, + transformation_features=transformation_features, + ) + + return hopsworks_udf + + @property + def return_type(self): + return self._return_type + + @property + def function_name(self): + return self._function_name + + @property + def function_source(self): + return self._function_source + + @property + def statistics_required(self): + return bool(self.statistics_features) + + @property + def transformation_statistics(self): + return self._statistics + + @property + def transformation_features(self): + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] + + @property + def statistics_features(self): + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + if transformation_feature.statistic_argument_name is not None + ] + + @property + def statistics_argument_mapping(self): + return { + transformation_feature.feature_name: transformation_feature.statistic_argument_name + for transformation_feature in self._transformation_features + } + + 
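The setter that follows resolves each computed statistic through `statistics_argument_mapping`, keyed by the bound feature name. Roughly — the constructor arguments shown for FeatureDescriptiveStatistics are assumed for illustration:

    from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics

    udf = min_max_scaler("amount")
    udf.statistics_argument_mapping  # {"amount": "statistics_value"}
    udf.transformation_statistics = [
        FeatureDescriptiveStatistics(feature_name="amount", min=0.0, max=100.0)
    ]
    udf.transformation_statistics["statistics_value"].max  # 100.0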
@transformation_statistics.setter + def transformation_statistics(self, statistics: List[FeatureDescriptiveStatistics]): + # TODO : Clean this up + self._statistics = dict() + for stat in statistics: + if stat.feature_name in self.statistics_argument_mapping.keys(): + self._statistics[ + self.statistics_argument_mapping[stat.feature_name] + ] = stat diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index fee9f1f41e..a1549e50b3 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -15,16 +15,13 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, Optional +from typing import Optional import humps from hsfs import util from hsfs.core import transformation_function_engine from hsfs.decorators import typechecked - - -if TYPE_CHECKING: - from hsfs.hopsworks_udf import HopsworksUdf +from hsfs.hopsworks_udf import HopsworksUdf @typechecked @@ -108,6 +105,12 @@ def plus_one(value): @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) + + if json_decamelized.get("hopsworks_udf", False): + json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( + json_decamelized["hopsworks_udf"] + ) + if "count" in json_decamelized: if json_decamelized["count"] == 0: return [] @@ -128,9 +131,8 @@ def to_dict(self): "id": self._id, "name": self._name, "version": self._version, - "sourceCodeContent": self._source_code_content, - "outputType": self._output_type, "featurestoreId": self._featurestore_id, + "hopsworks_udf": self._hopsworks_udf, } @property From 3ac5b26302ca7dbb42d50972ad1c058d6f02bc59 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 25 Apr 2024 10:53:02 +0200 Subject: [PATCH 05/58] basic functionality working with backend --- python/hsfs/engine/spark.py | 15 ++++-- python/hsfs/feature_view.py | 2 +- python/hsfs/hopsworks_udf.py | 71 ++++++++++++++------------ python/hsfs/transformation_function.py | 2 +- 4 files changed, 49 insertions(+), 41 deletions(-) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 74c1a833e9..38867ea81e 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1172,6 +1172,7 @@ def _apply_transformation_function( transformed_features = set() transformations = [] transformation_features = [] + output_col_names = [] explode_name = [] for transformation_function in transformation_functions: hopsworks_udf = transformation_function.hopsworks_udf @@ -1191,13 +1192,15 @@ def _apply_transformation_function( # TODO : Add statistics pandas_udf = hopsworks_udf.get_udf() + output_col_name = f'{hopsworks_udf.function_name}<{"-".join(hopsworks_udf.transformation_features)}>' transformations.append(pandas_udf) transformation_features.append(hopsworks_udf.transformation_features) + output_col_names.append(output_col_name) if isinstance(hopsworks_udf.return_type, List): - explode_name.append( - f'{pandas_udf.__name__}({", ".join(hopsworks_udf.transformation_features)}).*' - ) + explode_name.append(f"{output_col_name}.*") + else: + explode_name.append(output_col_name) def timezone_decorator(func, trans_fn=hopsworks_udf): if trans_fn.output_type != "TIMESTAMP": @@ -1236,8 +1239,10 @@ def decorated_func(x): transformed_dataset = dataset.select( *untransformed_columns, *[ - fun(*feature) - for fun, feature in zip(transformations, transformation_features) + fun(*feature).alias(output_col_name) + for fun, feature, output_col_name in zip( + transformations, 
transformation_features, output_col_names + ) ], ).select(*untransformed_columns, *explode_name) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 78e2101ed0..386e3b256f 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -3490,7 +3490,7 @@ def to_dict(self) -> Dict[str, Any]: "description": self._description, "query": self._query, "features": self._features, - "transformation_functions": self._transformation_functions, + "transformationFunctions": self._transformation_functions, "type": "featureViewDTO", } diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 5abaccedbe..d23e1f45d2 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -15,9 +15,9 @@ # import ast +import copy import inspect import json -import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Union @@ -56,7 +56,7 @@ class HopsworksUdf: PYTHON_SPARK_TYPE_MAPPING = { str: "string", int: "int", - float: "float", + float: "double", # "timestamp": TimestampType(), bool: "boolean", # "date": DateType(), @@ -161,10 +161,10 @@ def _extract_source_code(udf_function): except Exception: module_imports = "" # TODO : Check if warning is actually required. - warnings.warn( - "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", - stacklevel=2, - ) + # warnings.warn( + # "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", + # stacklevel=2, + # ) function_code = inspect.getsource(udf_function) source_code = "\n".join(module_imports) + "\n" + function_code @@ -206,14 +206,18 @@ def _remove_argument(self, source_code: str, arg_to_remove: str): # Get source code of the original function source_code = source_code.split("\n") + signature_start_line = None + signature_end_line = None # Find the line where the function signature is defined for i, line in enumerate(source_code): if line.strip().startswith("def "): - signature_line = i + signature_start_line = i + if signature_start_line is not None and ")" in line: + signature_end_line = i break # Parse the function signature to remove the specified argument - signature = source_code[signature_line] + signature = "".join(source_code[signature_start_line : signature_end_line + 1]) arg_list = signature.split("(")[1].split(")")[0].split(",") arg_list = [ arg.split(":")[0].strip() @@ -234,15 +238,10 @@ def _remove_argument(self, source_code: str, arg_to_remove: str): + ")" + signature.split(")")[1] ) - - # Modify the source code to reflect the changes - source_code[signature_line] = new_signature - - # Removing test before function signatre since they are decorators - source_code = source_code[signature_line:] - # Reconstruct the modified function as a string - modified_source = "\n".join(source_code) + modified_source = ( + new_signature + "\n" + "\n".join(source_code[signature_end_line + 1 :]) + ) # Define a new function with the modified source code return modified_source @@ -252,12 +251,15 @@ def get_spark_type(python_type: type): return HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[python_type] def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): - return ", ".join( - [ - f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' - for i in 
range(len(return_types)) - ] - ) + if isinstance(return_types, List): + return ", ".join( + [ + f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' + for i in range(len(return_types)) + ] + ) + else: + return f"{HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types]}" def hopsworksUdf_wrapper(self): # TODO : clean this up @@ -292,8 +294,10 @@ def __call__(self, *args: List[str]): raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) - - self._transformation_features = [ + udf = copy.deepcopy( + self + ) # TODO : Clean this copy is needed so that if the uses the same function to multiple feature, if copy not done then all variable would share the same traanformation feature, + udf._transformation_features = [ TransformationFeature( new_feature_name, transformation_feature.statistic_argument_name ) @@ -301,7 +305,7 @@ def __call__(self, *args: List[str]): self._transformation_features, args ) ] - return self + return udf def get_udf(self): if engine.get_type() in ["hive", "python", "training"]: @@ -319,12 +323,12 @@ def get_udf(self): def to_dict(self): return { - "func": self.function_source, - "name": self.function_name, - "return_type": [python_type.__name__ for python_type in self.return_type] + "sourceCode": self.function_source, + "outputTypes": [python_type.__name__ for python_type in self.return_type] if isinstance(self.return_type, List) else self.return_type.__name__, - "transformation_features": self.transformation_features, + "transformationFeatures": self.transformation_features, + "name": self._function_name, } def json(self) -> str: @@ -335,10 +339,10 @@ def from_response_json( cls: "HopsworksUdf", json_dict: Dict[str, Any] ) -> "HopsworksUdf": json_decamelized = humps.decamelize(json_dict) - function_source_code = json_decamelized["func"] + function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] - return_type = json_decamelized["return_type"] - transformation_features = json_decamelized["transformation_features"] + return_type = json_decamelized["output_types"] + transformation_features = json_decamelized["transformation_features"].split(",") hopsworks_udf = cls( func=function_source_code, @@ -349,10 +353,9 @@ def from_response_json( if isinstance(return_type, List) else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, - transformation_features=transformation_features, ) - return hopsworks_udf + return hopsworks_udf(*transformation_features) @property def return_type(self): diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index a1549e50b3..a731d604e1 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -132,7 +132,7 @@ def to_dict(self): "name": self._name, "version": self._version, "featurestoreId": self._featurestore_id, - "hopsworks_udf": self._hopsworks_udf, + "hopsworksUdf": self._hopsworks_udf, } @property From df5c9695e433f5b56409007f3726541f386a504e Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 25 Apr 2024 12:13:25 +0200 Subject: [PATCH 06/58] code with statistics working and saved to backend --- python/hsfs/hopsworks_udf.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index d23e1f45d2..9ec62200a9 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -86,6 +86,9 @@ def __init__( 
else: self._function_source: str = func + # TODO : Must clean this up : [Store actual source code]. The actual code without any clean up should be stored in the backed and the cleaned source must be dynamically build up when the data is read from the backend. + self._original_code = self._function_source + if transformation_features: self._transformation_features: List[TransformationFeature] = ( transformation_features @@ -264,7 +267,9 @@ def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): def hopsworksUdf_wrapper(self): # TODO : clean this up function_source = "\t".join(self.function_source.splitlines(True)) - if isinstance(self.return_type, List): + if ( + isinstance(self.return_type, List) and len(self.return_type) > 1 + ): # TODO : This check must be cleaned up for sure code = f"""def renaming_wrapper(*args): import pandas as pd {function_source} @@ -323,8 +328,10 @@ def get_udf(self): def to_dict(self): return { - "sourceCode": self.function_source, - "outputTypes": [python_type.__name__ for python_type in self.return_type] + "sourceCode": self._original_code, + "outputTypes": ",".join( + [python_type.__name__ for python_type in self.return_type] + ) if isinstance(self.return_type, List) else self.return_type.__name__, "transformationFeatures": self.transformation_features, @@ -341,7 +348,7 @@ def from_response_json( json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] - return_type = json_decamelized["output_types"] + return_type = json_decamelized["output_types"].split(",") transformation_features = json_decamelized["transformation_features"].split(",") hopsworks_udf = cls( @@ -354,7 +361,6 @@ def from_response_json( else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, ) - return hopsworks_udf(*transformation_features) @property From 2e9aa72a07af9ef25a5b7302088bf19f7bced7be Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 29 Apr 2024 16:43:27 +0200 Subject: [PATCH 07/58] working code for feature vector --- python/hsfs/core/feature_view_api.py | 17 ++-- .../core/transformation_function_engine.py | 82 +------------------ python/hsfs/core/vector_server.py | 37 +++++++-- python/hsfs/hopsworks_udf.py | 16 +++- python/hsfs/transformation_function.py | 38 +++++---- 5 files changed, 75 insertions(+), 115 deletions(-) diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py index ed5a8468c3..6ff621c7db 100644 --- a/python/hsfs/core/feature_view_api.py +++ b/python/hsfs/core/feature_view_api.py @@ -17,12 +17,7 @@ from typing import List, Optional, Union -from hsfs import ( - client, - feature_view, - training_dataset, - transformation_function_attached, -) +from hsfs import client, feature_view, training_dataset, transformation_function from hsfs.client.exceptions import RestAPIError from hsfs.constructor import query, serving_prepared_statement from hsfs.core import explicit_provenance, job, training_dataset_job_conf @@ -102,7 +97,9 @@ def get_by_name_version(self, name: str, version: int) -> feature_view.FeatureVi try: return feature_view.FeatureView.from_response_json( self._client._send_request( - self._GET, path, {"expand": ["query", "features"]} + self._GET, + path, + {"expand": ["query", "features", "transformationfunctions"]}, ) ) except RestAPIError as e: @@ -183,11 +180,11 @@ def get_serving_prepared_statement( def get_attached_transformation_fn( self, name: str, version: int ) -> Union[ - 
"transformation_function_attached.TransformationFunctionAttached", - List["transformation_function_attached.TransformationFunctionAttached"], + "transformation_function.TransformationFunction", + List["transformation_function.TransformationFunction"], ]: path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION] - return transformation_function_attached.TransformationFunctionAttached.from_response_json( + return transformation_function.TransformationFunction.from_response_json( self._client._send_request("GET", path) ) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 492567e6ec..e99b79672a 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -21,6 +21,7 @@ import hsfs import numpy + from hsfs import ( feature_view, statistics, @@ -37,7 +38,6 @@ ) from hsfs.core.builtin_transformation_function import BuiltInTransformationFunction - class TransformationFunctionEngine: BUILTIN_FN_NAMES = [ "min_max_scaler", @@ -62,16 +62,7 @@ def __init__(self, feature_store_id: int): self._feature_view_api: Optional["feature_view_api.FeatureViewApi"] = None self._statistics_engine: Optional["statistics_engine.StatisticsEngine"] = None - def save(self, transformation_fn_instance): - if self.is_builtin(transformation_fn_instance): - raise ValueError( - "Transformation function name '{name:}' with version 1 is reserved for built-in hsfs " - "functions. Please use other name or version".format( - name=transformation_fn_instance.name - ) - ) - if not callable(transformation_fn_instance.transformation_fn): - raise ValueError("transformer must be callable") + def save(self, transformation_fn_instance: TransformationFunction): self._transformation_function_api.register_transformation_fn( transformation_fn_instance ) @@ -109,73 +100,6 @@ def get_td_transformation_fn(self, training_dataset): ) return transformation_fn_dict - def is_builtin(self, transformation_fn_instance): - return ( - transformation_fn_instance.name in self.BUILTIN_FN_NAMES - and transformation_fn_instance.version == 1 - ) - - @staticmethod - def populate_builtin_fn_arguments( - feature_name, transformation_function_instance, feature_descriptive_stats - ): - # TODO : Make this statistics - if transformation_function_instance.name == "min_max_scaler": - min_value, max_value = BuiltInTransformationFunction.min_max_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - min_value=min_value, - max_value=max_value, - ) - elif transformation_function_instance.name == "standard_scaler": - mean, std_dev = BuiltInTransformationFunction.standard_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - mean=mean, - std_dev=std_dev, - ) - elif transformation_function_instance.name == "robust_scaler": - robust_scaler_stats = BuiltInTransformationFunction.robust_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - p25=robust_scaler_stats[24], - p50=robust_scaler_stats[49], - p75=robust_scaler_stats[74], - ) - elif transformation_function_instance.name == "label_encoder": - value_to_index = BuiltInTransformationFunction.encoder_stats( - 
feature_descriptive_stats, feature_name
-            )
-            transformation_function_instance.transformation_fn = partial(
-                transformation_function_instance.transformation_fn,
-                value_to_index=value_to_index,
-            )
-        else:
-            raise ValueError("Not implemented")
-
-        return transformation_function_instance
-
-    def populate_builtin_attached_fns(
-        self, attached_transformation_fns, feature_descriptive_stats
-    ):
-        # TODO : Remove
-        for ft_name in attached_transformation_fns:
-            if self.is_builtin(attached_transformation_fns[ft_name]):
-                # check if its built-in transformation function and populated with statistics arguments
-                transformation_fn = self.populate_builtin_fn_arguments(
-                    ft_name,
-                    attached_transformation_fns[ft_name],
-                    feature_descriptive_stats,
-                )
-                attached_transformation_fns[ft_name] = transformation_fn
-        return attached_transformation_fns
-
     @staticmethod
     def infer_spark_type(output_type):
         # TODO : Move to hopsworks_udf
@@ -217,6 +141,8 @@ def infer_spark_type(output_type):
         else:
             raise TypeError("Not supported type %s." % output_type)
 
+    # TODO : Think about statistics computation and fetching.
+
     # TODO : Think about what to do with label encoder features.
     @staticmethod
     def compute_transformation_fn_statistics(
diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py
index ae35f326b8..2ed6d8688f 100755
--- a/python/hsfs/core/vector_server.py
+++ b/python/hsfs/core/vector_server.py
@@ -103,6 +103,7 @@ def __init__(
         self._inference_helper_col_name = [
             feat.name for feat in features if feat.inference_helper_column
         ]
+        self._transformed_feature_vector_col_name = None
         self._skip_fg_ids = skip_fg_ids or set()
         self._serving_keys = serving_keys or []
 
@@ -125,7 +126,7 @@ def __init__(
 
     def init_serving(
         self,
-        entity: Union[feature_view.FeatureView, training_dataset.TrainingDataset],
+        entity: Union[feature_view.FeatureView],
         external: Optional[bool] = None,
         inference_helper_columns: bool = False,
         options: Optional[Dict[str, Any]] = None,
@@ -573,6 +574,7 @@ def get_inference_helpers(
             batch_results, batch=True, inference_helper=True, return_type=return_type
         )
 
+
     def which_client_and_ensure_initialised(
         self, force_rest_client: bool, force_sql_client: bool
     ) -> str:
@@ -634,15 +636,23 @@ def _set_default_client(
             self.default_client = self.DEFAULT_SQL_CLIENT
             self._init_sql_client = True
 
-    def apply_transformation(self, row_dict: Dict[str, Any]):
-        matching_keys = set(self.transformation_functions.keys()).intersection(
-            row_dict.keys()
-        )
-        _logger.debug("Applying transformation functions to : %s", matching_keys)
-        for feature_name in matching_keys:
-            row_dict[feature_name] = self.transformation_functions[
-                feature_name
-            ].transformation_fn(row_dict[feature_name])
+    def apply_transformation(self, row_dict: dict):
+        _logger.debug("Applying transformation functions.")
+        for transformation_function in self.transformation_functions:
+            features = [
+                pd.Series(row_dict[feature])
+                for feature in transformation_function.hopsworks_udf.transformation_features
+            ]
+            transformed_result = transformation_function.hopsworks_udf.get_udf()(
+                *features
+            )
+            if isinstance(transformed_result, pd.Series):
+                row_dict[transformed_result.name] = transformed_result.values[0]
+            else:
+                for col in transformed_result:
+                    row_dict[col] = transformed_result[col].values[0]
         return row_dict
 
     def apply_return_value_handlers(
@@ -1064,3 +1074,12 @@ def default_client(self, default_client: Literal["rest", "sql"]):
 
         _logger.debug(f"Default Online Store Client is set to {default_client}.")
         self._default_client = default_client
+
+    def 
transformed_feature_vector_col_name(self): + if self._transformed_feature_vector_col_name is None: + for transformation_function in self._transformation_functions: + self._transformed_feature_vector_col_name = ( + self._feature_vector_col_name + + transformation_function.hopsworks_udf.transformation_feature_names + ) + return self._transformed_feature_vector_col_name \ No newline at end of file diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 9ec62200a9..d69af6c863 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -361,7 +361,11 @@ def from_response_json( else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, ) - return hopsworks_udf(*transformation_features) + # TODO : Write proper comments for this use case. If we get a transformation function saved in the feature store then it will not have any specific transformaiton feature other than the ones in the code. + if "" not in transformation_features: + return hopsworks_udf(*transformation_features) + else: + return hopsworks_udf @property def return_type(self): @@ -414,3 +418,13 @@ def transformation_statistics(self, statistics: List[FeatureDescriptiveStatistic self._statistics[ self.statistics_argument_mapping[stat.feature_name] ] = stat + + @property + def transformation_feature_names(self) -> List[str]: + if isinstance(self.return_type, List) and len(self.return_type) > 1: + return [ + f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' + for i in range(len(self.return_type)) + ] + else: + return [f'{self.function_name}<{"-".join(self.transformation_features)}>'] diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index a731d604e1..270027a743 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -15,10 +15,11 @@ from __future__ import annotations import json -from typing import Optional +from typing import List, Optional import humps from hsfs import util +from hsfs.client.exceptions import FeatureStoreException from hsfs.core import transformation_function_engine from hsfs.decorators import typechecked from hsfs.hopsworks_udf import HopsworksUdf @@ -48,8 +49,11 @@ def __init__( self._featurestore_id ) ) + if not isinstance(hopsworks_udf, HopsworksUdf): + raise FeatureStoreException( + "Use hopsworks_udf decorator when creating the feature view." + ) self._hopsworks_udf = hopsworks_udf - self._name = hopsworks_udf.function_name self._feature_group_feature_name: Optional[str] = None self._feature_group_id: Optional[int] = None @@ -102,20 +106,29 @@ def plus_one(value): """ self._transformation_function_engine.delete(self) + def __call__(self, *args: List[str]): + self._hopsworks_udf = self._hopsworks_udf(*args) + return self + @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) - - if json_decamelized.get("hopsworks_udf", False): - json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( - json_decamelized["hopsworks_udf"] - ) - + print(json_decamelized) + # TODO : Clean this up. 
if "count" in json_decamelized: if json_decamelized["count"] == 0: return [] + for tffn_dto in json_decamelized["items"]: + if tffn_dto.get("hopsworks_udf", False): + tffn_dto["hopsworks_udf"] = HopsworksUdf.from_response_json( + tffn_dto["hopsworks_udf"] + ) return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] else: + if json_decamelized.get("hopsworks_udf", False): + json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( + json_decamelized["hopsworks_udf"] + ) return cls(**json_decamelized) def update_from_response_json(self, json_dict): @@ -129,7 +142,6 @@ def json(self): def to_dict(self): return { "id": self._id, - "name": self._name, "version": self._version, "featurestoreId": self._featurestore_id, "hopsworksUdf": self._hopsworks_udf, @@ -144,10 +156,6 @@ def id(self) -> id: def id(self, id: int): self._id = id - @property - def name(self) -> str: - return self._name - @property def version(self) -> int: return self._version @@ -156,10 +164,6 @@ def version(self) -> int: def hopsworks_udf(self) -> HopsworksUdf: return self._hopsworks_udf - @name.setter - def name(self, name: str): - self._name = name - @version.setter def version(self, version: int): self._version = version From fceb9b55e9f204eaa2e39048be29b6849244665c Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 2 May 2024 14:37:55 +0200 Subject: [PATCH 08/58] reformatted and documented Hopswork UDF class --- python/hsfs/hopsworks_udf.py | 532 ++++++++++++++++++++++++----------- 1 file changed, 368 insertions(+), 164 deletions(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index d69af6c863..6544e9fdd5 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -18,8 +18,10 @@ import copy import inspect import json +import warnings from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Union +from datetime import date, datetime, time +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import humps from hsfs import engine, util @@ -27,9 +29,37 @@ from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -def hopsworks_udf(return_type: Union[List[type], type]): +def hopsworks_udf(output_type: Union[List[type], type]): + """ + Create an User Defined Function that can be and used within the Hopsworks Feature Store. + + Hopsworks UDF's are user defined functions that executes as 'pandas_udf' when executing + in spark engine and as pandas functions in the python engine. A Hopsworks udf is defined + using the `hopsworks_udf` decorator. The outputs of the defined UDF must be mentioned in the + decorator as a list of python types. + + + !!! example + ```python + from hsfs.hopsworks_udf import hopsworks_udf + + @hopsworks_udf(float) + def add_one(data1 : pd.Series): + return data1 + 1 + ``` + + # Arguments + output_type: `list`. The output types of the defined UDF + + # Returns + `HopsworksUdf`: The metadata object for hopsworks UDF's. + + # Raises + `hsfs.client.exceptions.FeatureStoreException` : If unable to create UDF. + """ + def wrapper(func: Callable): - udf = HopsworksUdf(func=func, return_type=return_type) + udf = HopsworksUdf(func=func, output_type=output_type) return udf return wrapper @@ -37,6 +67,17 @@ def wrapper(func: Callable): @dataclass class TransformationFeature: + """ + Mapping of feature names to their corresponding statistics argument names in the code. + + The statistic_argument_name for a feature name would be None if the feature does not need statistics. 
+
+    Attributes
+    ----------
+        feature_name (str) : Name of the feature.
+        statistic_argument_name (str) : Name of the statistics argument in the code for the feature specified in the feature name.
+    """
+
     feature_name: str
     statistic_argument_name: Optional[str]
 
     def to_dict(self):
         return {
             "feature_name": self.feature_name,
             "statistic_argument_name": self.statistic_argument_name,
         }
 
 
 class HopsworksUdf:
     """
-    Metadata class to store information about UDF
+    Meta data for user defined functions.
+
+    Stores meta data required to execute the user defined function in both the spark and python engines.
+    The class uses the metadata to dynamically generate user defined functions based on the
+    engine it is executed in.
+
+    Attributes
+    ----------
+        output_type (List[str]) : Output types of the columns returned from the UDF.
+        function_name (str) : Name of the UDF.
+        statistics_required (bool) : True if statistics are required for any of the parameters of the UDF.
+        transformation_statistics (Dict[str, FeatureDescriptiveStatistics]): Dictionary that maps the statistics_argument name in the function to the actual statistics variable.
+        transformation_features (List[str]) : List of feature names to which the transformation function would be applied.
+        statistics_features (List[str]) : List of feature names that require statistics.
     """
 
-    # TODO : Complete this
+    # Mapping for converting python types to spark types - required for creating pandas UDF's.
     PYTHON_SPARK_TYPE_MAPPING = {
         str: "string",
-        int: "int",
+        int: "bigint",
         float: "double",
-        # "timestamp": TimestampType(),
         bool: "boolean",
-        # "date": DateType(),
-        # "binary": BinaryType(),
+        datetime: "timestamp",
+        time: "timestamp",
+        date: "date",
     }
 
-    STRING_PYTHON_TYPES_MAPPING = {"str": str, "int": int, "float": float, "bool": bool}
-
     def __init__(
         self,
         func: Union[Callable, str],
-        return_type: Union[List[type], type],
-        name: str = None,
-        transformation_features: List[TransformationFeature] = None,
+        output_type: Union[List[type], type, List[str], str],
+        name: Optional[str] = None,
+        transformation_features: Optional[List[TransformationFeature]] = None,
     ):
-        if name is None:
-            self._function_name: str = func.__name__
-        else:
-            self._function_name: str = name
-
-        self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = dict()
+        self._output_type: List[str] = HopsworksUdf._validate_and_convert_output_types(
+            output_type
+        )
 
-        self._return_type: Union[List[type], type] = return_type
+        self._function_name: str = func.__name__ if name is None else name
 
-        if isinstance(func, Callable):
-            self._function_source: str = HopsworksUdf._extract_source_code(func)
-        else:
-            self._function_source: str = func
-
-        # TODO : Must clean this up : [Store actual source code]. The actual code without any clean up should be stored in the backed and the cleaned source must be dynamically build up when the data is read from the backend.
- self._original_code = self._function_source + self._function_source: str = ( + HopsworksUdf._extract_source_code(func) + if isinstance(func, Callable) + else func + ) - if transformation_features: - self._transformation_features: List[TransformationFeature] = ( - transformation_features - ) - else: - self._transformation_features: List[TransformationFeature] = ( - HopsworksUdf._extract_function_arguments(self.function_source) - ) + self._transformation_features: List[TransformationFeature] = ( + HopsworksUdf._extract_function_arguments(self._function_source) + if not transformation_features + else transformation_features + ) - self._function_source = self._remove_argument( - self.function_source, "statistics" + self._formatted_function_source = HopsworksUdf._format_source_code( + self._function_source, self._transformation_features ) - HopsworksUdf.validate_arguments(self.return_type) - def get_transformation_features(self): - return self.transformation_features + self._output_column_names: List[str] = self._get_output_column_names() + + self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = None @staticmethod - def validate_arguments(return_type): - if isinstance(return_type, list): - for python_type in return_type: - if not isinstance(python_type, type): - raise FeatureStoreException( - f'Return types provided must be a python type or a list of python types. "{python_type}" is not python type' - ) - else: - if not isinstance(return_type, type): + def _validate_and_convert_output_types( + output_types: Union[List[type], List[str]], + ) -> List[str]: + """ + Function that takes in a type or list of types validates if it is supported and return a list of strings + + # Arguments + output_types: `list`. List of python types. + + # Raises + `hsfs.client.exceptions.FeatureStoreException` : If the any of the output type is invalid + """ + convert_output_types = [] + output_types = ( + output_types if isinstance(output_types, List) else [output_types] + ) + for output_type in output_types: + if ( + output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.keys() + and output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.values() + ): raise FeatureStoreException( - f'Return types provided must be a python type or a list of python types. "{return_type}" is not python type or a list' + f"Output type {output_type} is not supported. Please refer to DOCUMENTATION to get more information on the supported types." ) + convert_output_types.append( + output_type + if isinstance(output_type, str) + else HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[output_type] + ) + return convert_output_types @staticmethod - def _get_module_imports(path): + def _get_module_imports(path: str) -> List[str]: + """Function that extracts the imports used in the python file specified in the path. + + # Arguments + path: `str`. Path to python file from which imports are to be extracted. + + # Returns + `List[str]`: A list of string that contains the import statement using in the file. 
+ """ imports = [] with open(path) as fh: root = ast.parse(fh.read(), path) - for node in ast.iter_child_nodes(root): if isinstance(node, ast.Import): imported_module = False @@ -133,7 +202,6 @@ def _get_module_imports(path): imported_module = node.module else: continue - for n in node.names: if imported_module: import_line = "from " + imported_module + " import " + n.name @@ -145,7 +213,20 @@ def _get_module_imports(path): return imports @staticmethod - def _get_module_path(module_name): + def _get_module_path(module_name: str) -> str: + """ + Function that returns the path to the source code of a python module. + + Cannot extract path if the module is defined in a jupyter notebook since it is currently impossible find the path of a jupyter notebook.(https://github.com/ipython/ipython/issues/10123) + + # Arguments + path: `str`. Path to python file from which imports are to be extracted. + # Raises + AttributeError : If the provided module is defined in a jupyter notebook. + # Returns + `str`: a string that contains the path to the module + """ + def _get_module_path(module): return module.__file__ @@ -156,18 +237,26 @@ def _get_module_path(module): return module_path["path"] @staticmethod - def _extract_source_code(udf_function): + def _extract_source_code(udf_function: Callable) -> str: + """ + Function to extract the source code of the function along with the imports used in the file. + + The module imports cannot be extracted if the function is defined in a jupyter notebook. + + # Arguments + udf_function: `Callable`. Function for which the source code must be extracted. + # Returns + `str`: a string that contains the source code of function along with the extracted module imports. + """ try: module_imports = HopsworksUdf._get_module_imports( HopsworksUdf._get_module_path(udf_function.__module__) ) - except Exception: - module_imports = "" - # TODO : Check if warning is actually required. - # warnings.warn( - # "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", - # stacklevel=2, - # ) + except AttributeError: + warnings.warn( + "Passed UDF defined in a Jupyter notebook. Cannot extract import dependencies from a notebook. Please make sure to import all dependencies for the UDF inside the function.", + stacklevel=2, + ) function_code = inspect.getsource(udf_function) source_code = "\n".join(module_imports) + "\n" + function_code @@ -175,22 +264,68 @@ def _extract_source_code(udf_function): return source_code @staticmethod - def _extract_function_arguments(source_code): - # Get source code of the original function + def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, int]: + """ + Function to parse the source code to extract the argument along with the start and end line of the function signature + + # Arguments + source_code: `str`. Source code of a function. 
+ # Returns + `List[str]`: List of function arguments + `str`: function signature + `int`: starting line number of function signature + `int`: ending line number of function signature + + """ source_code = source_code.split("\n") + signature_start_line = None + signature_end_line = None # Find the line where the function signature is defined for i, line in enumerate(source_code): if line.strip().startswith("def "): - signature_line = i + signature_start_line = i + if signature_start_line is not None and ")" in line: + signature_end_line = i break # Parse the function signature to remove the specified argument - signature = source_code[signature_line] + signature = "".join( + [ + code.split("#")[0] + for code in source_code[signature_start_line : signature_end_line + 1] + ] + ) arg_list = signature.split("(")[1].split(")")[0].split(",") + return arg_list, signature, signature_start_line, signature_end_line + + @staticmethod + def _extract_function_arguments(source_code: str) -> List[TransformationFeature]: + """ + Function to extract the argument names from a provided function source code. + + # Arguments + source_code: `str`. Source code of a function. + # Returns + `List[TransformationFeature]`: List of TransformationFeature that provide a mapping from feature names to corresponding statistics parameters if any is present. + """ + # Get source code of the original function + arg_list, _, _, _ = HopsworksUdf._parse_function_signature(source_code) + + if arg_list == [""]: + raise FeatureStoreException( + "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." + ) arg_list = [arg.split(":")[0].strip() for arg in arg_list] + for arg in arg_list: + if arg.startswith("statistics"): + if arg.split("statistics_")[1] not in arg_list: + raise FeatureStoreException( + f"No argument corresponding to statistics parameter '{arg}' present in function definition." + ) + return [ TransformationFeature( arg, f"statistics_{arg}" if f"statistics_{arg}" in arg_list else None @@ -199,152 +334,202 @@ def _extract_function_arguments(source_code): if not arg.startswith("statistics") ] - def _remove_argument(self, source_code: str, arg_to_remove: str): - """ " - Function to remove statistics arguments from passed udf and type hinting. - Statistics arguments are removed since pandas UDF's do not accept extra arguments. - Statistics parameters are dynamically injected into the function scope. + @staticmethod + def _format_source_code( + source_code: str, transformation_features: List[TransformationFeature] + ) -> str: """ + Function that parses the existing source code to remove statistics parameter and remove all decorators and type hints from the function source code. - # Get source code of the original function - source_code = source_code.split("\n") + # Arguments + source_code: `str`. Source code of a function. + transformation_features `List[TransformationFeature]`: List of transformation features provided in the function argument. + # Returns + `str`: Source code that does not contain any decorators, type hints or statistics parameters. 
+ """ - signature_start_line = None - signature_end_line = None - # Find the line where the function signature is defined - for i, line in enumerate(source_code): - if line.strip().startswith("def "): - signature_start_line = i - if signature_start_line is not None and ")" in line: - signature_end_line = i - break + _, signature, _, signature_end_line = HopsworksUdf._parse_function_signature( + source_code + ) - # Parse the function signature to remove the specified argument - signature = "".join(source_code[signature_start_line : signature_end_line + 1]) - arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [ - arg.split(":")[0].strip() - for arg in arg_list - if ( - arg_to_remove not in list(map(str.strip, arg.split(" "))) - and arg_to_remove not in list(map(str.strip, arg.split(":"))) - and arg_to_remove not in list(map(str.strip, arg.split("_"))) - and arg.strip() != arg_to_remove - ) - ] + arg_list = [feature.feature_name for feature in transformation_features] # Reconstruct the function signature new_signature = ( - signature.split("(")[0] - + "(" - + ", ".join(arg_list) - + ")" - + signature.split(")")[1] + signature.split("(")[0].strip() + "(" + ", ".join(arg_list) + "):" ) + source_code = source_code.split("\n") # Reconstruct the modified function as a string modified_source = ( - new_signature + "\n" + "\n".join(source_code[signature_end_line + 1 :]) + new_signature + "\n" + "\n\t".join(source_code[signature_end_line + 1 :]) ) # Define a new function with the modified source code return modified_source - @staticmethod - def get_spark_type(python_type: type): - return HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[python_type] + def _get_output_column_names(self) -> str: + """ + Function that generates feature names for the transformed features + + # Returns + `List[str]`: List of feature names for the transformed columns + """ + if len(self.output_types) > 1: + return [ + f'{self.function_name}_{"-".join(self.transformation_features)}_{i}' + for i in range(len(self.output_types)) + ] + else: + return [f'{self.function_name}_{"-".join(self.transformation_features)}_'] + + def _create_pandas_udf_return_schema_from_list(self) -> str: + """ + Function that creates the return schema required for executing the defined UDF's as pandas UDF's in Spark. - def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): - if isinstance(return_types, List): + # Returns + `str`: DDL-formatted type string that denotes the return types of the user defined function. + """ + if len(self.output_types) > 1: return ", ".join( [ - f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' - for i in range(len(return_types)) + f"{self.output_column_names[i]} {self.output_types[i]}" + for i in range(len(self.output_types)) ] ) else: - return f"{HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types]}" - - def hopsworksUdf_wrapper(self): - # TODO : clean this up - function_source = "\t".join(self.function_source.splitlines(True)) - if ( - isinstance(self.return_type, List) and len(self.return_type) > 1 - ): # TODO : This check must be cleaned up for sure + return self.output_types[0] + + def hopsworksUdf_wrapper(self) -> Callable: + """ + Function that creates a dynamic wrapper function for the defined udf that renames the columns output by the UDF into specified column names. + + The renames is done so that the column names match the schema expected by spark when multiple columns are returned in a pandas udf. 
+ The wrapper function would be available in the main scope of the program. + + # Returns + `Callable`: A wrapper function that renames outputs of the User defined function into specified output column names. + """ + # Defining wrapper function that renames the column names to specific names + if len(self.output_types) > 1: code = f"""def renaming_wrapper(*args): import pandas as pd - {function_source} + {self._formatted_function_source} df = {self.function_name}(*args) - df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' for i in range(len(df.columns))}}) + df = df.rename(columns = {{df.columns[i]: _output_col_names[i] for i in range(len(df.columns))}}) return df""" else: code = f"""def renaming_wrapper(*args): import pandas as pd - {function_source} + {self._formatted_function_source} df = {self.function_name}(*args) - df = df.rename(f'{self.function_name}<{"-".join(self.transformation_features)}>') + df = df.rename(_output_col_names[0]) return df""" + + # injecting variables into scope used to execute wrapper function. scope = __import__("__main__").__dict__ - scope.update(self.transformation_statistics) + if self.transformation_statistics is not None: + scope.update(self.transformation_statistics) + scope.update({"_output_col_names": self.output_column_names}) + + # executing code exec(code, scope) + + # returning executed function object return eval("renaming_wrapper", scope) - def __call__(self, *args: List[str]): - # TODO : Raise an execption if the number of features are incorrect. - if len(args) != len(self.transformation_features): + def __call__(self, *features: List[str]) -> "HopsworksUdf": + """ + Set features to be passed as arguments to the user defined functions + + # Arguments + features: Name of features to be passed to the User Defined function + # Returns + `HopsworksUdf`: Meta data class for the user defined function. + """ + + if len(features) != len(self.transformation_features): raise FeatureStoreException( "Number of features provided does not match the number of features provided in the UDF definition" ) - for arg in args: + + for arg in features: if not isinstance(arg, str): raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) - udf = copy.deepcopy( - self - ) # TODO : Clean this copy is needed so that if the uses the same function to multiple feature, if copy not done then all variable would share the same traanformation feature, + # Create a copy of the UDF to associate it with new feature names. + udf = copy.deepcopy(self) + udf._transformation_features = [ TransformationFeature( new_feature_name, transformation_feature.statistic_argument_name ) for transformation_feature, new_feature_name in zip( - self._transformation_features, args + self._transformation_features, features ) ] return udf - def get_udf(self): + def get_udf(self) -> Callable: + """ + Function that checks the current engine type and returns the appropriate UDF. + + In the spark engine the UDF is returned as a pandas UDF. + While in the python engine the UDF is returned as python function. + + # Returns + `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
+ """ if engine.get_type() in ["hive", "python", "training"]: return self.hopsworksUdf_wrapper() else: from pyspark.sql.functions import pandas_udf - # TODO : Make this proper return pandas_udf( f=self.hopsworksUdf_wrapper(), - returnType=self.create_pandas_udf_return_schema_from_list( - self.return_type - ), + returnType=self._create_pandas_udf_return_schema_from_list(), ) - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: + """ + Convert class into a dictionary for json serialization. + + # Returns + `Dict`: Dictionary that contains all data required to json serialize the object. + """ return { "sourceCode": self._original_code, "outputTypes": ",".join( - [python_type.__name__ for python_type in self.return_type] + [python_type.__name__ for python_type in self.output_types] ) - if isinstance(self.return_type, List) - else self.return_type.__name__, + if isinstance(self.output_types, List) + else self.output_types.__name__, "transformationFeatures": self.transformation_features, "name": self._function_name, } def json(self) -> str: + """ + Json serialize object. + + # Returns + `str`: Json serialized object. + """ return json.dumps(self, cls=util.FeatureStoreEncoder) @classmethod def from_response_json( cls: "HopsworksUdf", json_dict: Dict[str, Any] ) -> "HopsworksUdf": + """ + Function that deserializes json obtained from the java backend. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `HopsworksUdf`: Json deserialized class object. + """ + json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] @@ -361,41 +546,55 @@ def from_response_json( else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, ) - # TODO : Write proper comments for this use case. If we get a transformation function saved in the feature store then it will not have any specific transformaiton feature other than the ones in the code. + + # Set transformation features if already set. if "" not in transformation_features: return hopsworks_udf(*transformation_features) else: return hopsworks_udf @property - def return_type(self): - return self._return_type + def output_types(self) -> List[str]: + """Get the output types of the UDF""" + return self._output_type @property - def function_name(self): + def function_name(self) -> str: + """Get the function name of the UDF""" return self._function_name @property - def function_source(self): - return self._function_source - - @property - def statistics_required(self): + def statistics_required(self) -> bool: + """Get if statistics for any feature is required by the UDF""" return bool(self.statistics_features) @property - def transformation_statistics(self): + def transformation_statistics( + self, + ) -> Optional[Dict[str, FeatureDescriptiveStatistics]]: + """Feature statistics required for the defined UDF""" return self._statistics @property - def transformation_features(self): + def output_column_names(self) -> List[str]: + """Output columns names of the transformation function""" + return self._output_column_names + + @property + def transformation_features(self) -> List[str]: + """ + List of feature names to be used in the User Defined Function. 
+ """ return [ transformation_feature.feature_name for transformation_feature in self._transformation_features ] @property - def statistics_features(self): + def statistics_features(self) -> List[str]: + """ + list of feature names that require statistics + """ return [ transformation_feature.feature_name for transformation_feature in self._transformation_features @@ -403,28 +602,33 @@ def statistics_features(self): ] @property - def statistics_argument_mapping(self): + def _statistics_argument_mapping(self) -> Dict[str, str]: + """ + Dictionary that maps feature names to the statistics arguments names in the User defined function. + """ return { transformation_feature.feature_name: transformation_feature.statistic_argument_name for transformation_feature in self._transformation_features } @transformation_statistics.setter - def transformation_statistics(self, statistics: List[FeatureDescriptiveStatistics]): - # TODO : Clean this up + def transformation_statistics( + self, statistics: List[FeatureDescriptiveStatistics] + ) -> None: self._statistics = dict() for stat in statistics: - if stat.feature_name in self.statistics_argument_mapping.keys(): + if stat.feature_name in self._statistics_argument_mapping.keys(): self._statistics[ - self.statistics_argument_mapping[stat.feature_name] + self._statistics_argument_mapping[stat.feature_name] ] = stat - @property - def transformation_feature_names(self) -> List[str]: - if isinstance(self.return_type, List) and len(self.return_type) > 1: - return [ - f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' - for i in range(len(self.return_type)) - ] + @output_column_names.setter + def output_column_names(self, output_col_names: Union[str, List[str]]) -> None: + if not isinstance(output_col_names, List): + output_col_names = [output_col_names] + if len(output_col_names) != len(self.output_types): + raise FeatureStoreException( + f"Provided names for output columns does not match the number of columns returned from the UDF. Please provide {len(self.output_types)} names." + ) else: - return [f'{self.function_name}<{"-".join(self.transformation_features)}>'] + self._output_column_names = output_col_names From 52167f1a930c9ce47d1f19b36dcfb9c34aff75c2 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 3 May 2024 10:02:24 +0200 Subject: [PATCH 09/58] unit tests for transformation functions --- python/hsfs/hopsworks_udf.py | 40 ++-- python/hsfs/transformation_function.py | 105 ++++++++--- .../transformation_function_fixtures.json | 90 ++++++--- python/tests/test_transformation_function.py | 174 +++++++++++++----- 4 files changed, 300 insertions(+), 109 deletions(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 6544e9fdd5..b56efb2c5a 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -27,9 +27,10 @@ from hsfs import engine, util from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.decorators import typechecked -def hopsworks_udf(output_type: Union[List[type], type]): +def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": """ Create an User Defined Function that can be and used within the Hopsworks Feature Store. @@ -58,8 +59,8 @@ def add_one(data1 : pd.Series): `hsfs.client.exceptions.FeatureStoreException` : If unable to create UDF. 
""" - def wrapper(func: Callable): - udf = HopsworksUdf(func=func, output_type=output_type) + def wrapper(func: Callable) -> HopsworksUdf: + udf = HopsworksUdf(func=func, output_types=output_type) return udf return wrapper @@ -81,13 +82,14 @@ class TransformationFeature: feature_name: str statistic_argument_name: Optional[str] - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: return { "feature_name": self.feature_name, "statistic_argument_name": self.statistic_argument_name, } +@typechecked class HopsworksUdf: """ Meta data for user defined functions. @@ -120,12 +122,12 @@ class HopsworksUdf: def __init__( self, func: Union[Callable, str], - output_type: Union[List[type], type, List[str], str], + output_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, ): - self._output_type: List[str] = HopsworksUdf._validate_and_convert_output_types( - output_type + self._output_types: List[str] = HopsworksUdf._validate_and_convert_output_types( + output_types ) self._function_name: str = func.__name__ if name is None else name @@ -253,6 +255,7 @@ def _extract_source_code(udf_function: Callable) -> str: HopsworksUdf._get_module_path(udf_function.__module__) ) except AttributeError: + module_imports = [""] warnings.warn( "Passed UDF defined in a Jupyter notebook. Cannot extract import dependencies from a notebook. Please make sure to import all dependencies for the UDF inside the function.", stacklevel=2, @@ -445,6 +448,8 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": features: Name of features to be passed to the User Defined function # Returns `HopsworksUdf`: Meta data class for the user defined function. + # Raises + `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. """ if len(features) != len(self.transformation_features): @@ -533,18 +538,17 @@ def from_response_json( json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] - return_type = json_decamelized["output_types"].split(",") - transformation_features = json_decamelized["transformation_features"].split(",") + output_types = [ + output_type.strip() + for output_type in json_decamelized["output_types"].split(",") + ] + transformation_features = [ + feature.strip() + for feature in json_decamelized["transformation_features"].split(",") + ] hopsworks_udf = cls( - func=function_source_code, - return_type=[ - cls.STRING_PYTHON_TYPES_MAPPING[python_type] - for python_type in return_type - ] - if isinstance(return_type, List) - else cls.STRING_PYTHON_TYPES_MAPPING[return_type], - name=function_name, + func=function_source_code, output_types=output_types, name=function_name ) # Set transformation features if already set. 
@@ -556,7 +560,7 @@ def from_response_json( @property def output_types(self) -> List[str]: """Get the output types of the UDF""" - return self._output_type + return self._output_types @property def function_name(self) -> str: diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 270027a743..1ba52dea4e 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -15,7 +15,7 @@ from __future__ import annotations import json -from typing import List, Optional +from typing import Any, Dict, List, Optional import humps from hsfs import util @@ -27,22 +27,31 @@ @typechecked class TransformationFunction: + """ + Main DTO class for transformation functions. + + Attributes + ---------- + id (int) : Id of transformation function. + version (int) : Version of transformation function. + hopsworks_udf (HopsworksUdf): Meta data class for user defined functions. + """ + def __init__( self, featurestore_id: int, hopsworks_udf: HopsworksUdf, version: Optional[int] = None, id: Optional[int] = None, - # TODO : Check if the below are actually needed type=None, items=None, count=None, href=None, **kwargs, ): - self._id = id - self._featurestore_id = featurestore_id - self._version = version + self._id: int = id + self._featurestore_id: int = featurestore_id + self._version: int = version self._transformation_function_engine = ( transformation_function_engine.TransformationFunctionEngine( @@ -51,25 +60,26 @@ def __init__( ) if not isinstance(hopsworks_udf, HopsworksUdf): raise FeatureStoreException( - "Use hopsworks_udf decorator when creating the feature view." + "Please use the hopsworks_udf decorator when defining transformation functions." ) - self._hopsworks_udf = hopsworks_udf - self._feature_group_feature_name: Optional[str] = None - self._feature_group_id: Optional[int] = None - def save(self): + self._hopsworks_udf: HopsworksUdf = hopsworks_udf + + def save(self) -> None: """Persist transformation function in backend. !!! example ```python + # import hopsworks udf decorator + from hsfs.hopsworks_udf import HopsworksUdf # define function + @hopsworks_udf(int) def plus_one(value): return value + 1 # create transformation function plus_one_meta = fs.create_transformation_function( transformation_function=plus_one, - output_type=int, version=1 ) @@ -79,19 +89,21 @@ def plus_one(value): """ self._transformation_function_engine.save(self) - def delete(self): + def delete(self) -> None: """Delete transformation function from backend. !!! example ```python + # import hopsworks udf decorator + from hsfs.hopsworks_udf import HopsworksUdf # define function + @hopsworks_udf(int) def plus_one(value): return value + 1 # create transformation function plus_one_meta = fs.create_transformation_function( transformation_function=plus_one, - output_type=int, version=1 ) # persist transformation function in backend @@ -106,15 +118,32 @@ def plus_one(value): """ self._transformation_function_engine.delete(self) - def __call__(self, *args: List[str]): - self._hopsworks_udf = self._hopsworks_udf(*args) + def __call__(self, *features: List[str]) -> TransformationFunction: + """ + Update the feature to be using in the transformation function + + # Arguments + features: Name of features to be passed to the User Defined function + # Returns + `HopsworksUdf`: Meta data class for the user defined function. 
+ # Raises + `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. + """ + self._hopsworks_udf = self._hopsworks_udf(*features) return self @classmethod - def from_response_json(cls, json_dict): + def from_response_json(cls, json_dict: Dict[str, Any]) -> TransformationFunction: + """ + Function that deserializes json obtained from the java backend. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ json_decamelized = humps.decamelize(json_dict) - print(json_decamelized) - # TODO : Clean this up. + if "count" in json_decamelized: if json_decamelized["count"] == 0: return [] @@ -131,15 +160,37 @@ def from_response_json(cls, json_dict): ) return cls(**json_decamelized) - def update_from_response_json(self, json_dict): + def update_from_response_json( + self, json_dict: Dict[str, Any] + ) -> TransformationFunction: + """ + Function that updates class based on the response obtained from the java backend. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ json_decamelized = humps.decamelize(json_dict) self.__init__(**json_decamelized) return self - def json(self): + def json(self) -> str: + """ + Json serialize object. + + # Returns + `str`: Json serialized object. + """ return json.dumps(self, cls=util.FeatureStoreEncoder) - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: + """ + Convert class into a dictionary for json serialization. + + # Returns + `Dict`: Dictionary that contains all data required to json serialize the object. 
+ """ return { "id": self._id, "version": self._version, @@ -153,17 +204,19 @@ def id(self) -> id: return self._id @id.setter - def id(self, id: int): + def id(self, id: int) -> None: self._id = id @property def version(self) -> int: + """Version of the transformation function.""" return self._version + @version.setter + def version(self, version: int) -> None: + self._version = version + @property def hopsworks_udf(self) -> HopsworksUdf: + """Meta data class for the user defined transformation function.""" return self._hopsworks_udf - - @version.setter - def version(self, version: int): - self._version = version diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 504671dffc..98017a07c5 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -1,16 +1,54 @@ { - "get": { + "get_one_argument_no_statistics_function": { "response": { - "featurestore_id": 11, - "transformation_fn": null, - "version": 1, - "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "float", - "id": 43, - "type": "transformationFunctionTDO", - "href": "test_href" + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } + } + }, + "get_one_argument_with_statistics_function": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + } + }, + "get_multiple_argument_with_statistics_function": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(str)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n", + "name": "test_func", + "outputTypes":"string", + "transformationFeatures":"feature1, feature2, feature3" + } + } + }, + "get_multiple_return_type_functions": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", + "name": "test_func", + "outputTypes":"string, double", + "transformationFeatures":"feature1, feature2, feature3" + } } }, "get_basic_info": { @@ -23,16 +61,26 @@ "count": 1, "items": [ { - "featurestore_id": 11, - "transformation_fn": null, + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + }, + { + "id" : 2, "version": 1, - "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "float", - "id": 43, - "type": "transformationFunctionTDO", - "href": "test_href" + "featurestoreId": 11, + "hopsworksUdf":{ + 
"sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } } ] } @@ -43,4 +91,4 @@ "items": [] } } -} \ No newline at end of file +} diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index 41123ff791..0d1f29f346 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -15,85 +15,171 @@ # -from hsfs import transformation_function +from hsfs.transformation_function import TransformationFunction class TestTransformationFunction: - def test_from_response_json(self, backend_fixtures): + def test_from_response_json_one_argument_no_statistics(self, backend_fixtures): # Arrange - json = backend_fixtures["transformation_function"]["get"]["response"] + json = backend_fixtures["transformation_function"][ + "get_one_argument_no_statistics_function" + ]["response"] # Act - tf = transformation_function.TransformationFunction.from_response_json(json) + tf = TransformationFunction.from_response_json(json) # Assert - assert tf.id == 43 + assert tf.id == 1 assert tf._featurestore_id == 11 - assert tf.version == 1 - assert tf.name == "test_name" - assert tf.transformation_fn is None - assert tf.output_type == "FLOAT" + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_one_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert not tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["col1"] + assert tf.hopsworks_udf.statistics_features == [] assert ( - tf.source_code_content - == '{"module_imports": "", "transformer_code": "test_builtin_source_code"}' + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None - def test_from_response_json_basic_info(self, mocker, backend_fixtures): + def test_from_response_json_one_argument_with_statistics(self, backend_fixtures): # Arrange - mocker.patch( - "hsfs.transformation_function.TransformationFunction._load_source_code" + json = backend_fixtures["transformation_function"][ + "get_one_argument_with_statistics_function" + ]["response"] + + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" ) - json = backend_fixtures["transformation_function"]["get_basic_info"]["response"] + + def test_from_response_json_multiple_argument_with_statistics( + self, backend_fixtures + ): + # Arrange + json = backend_fixtures["transformation_function"][ + "get_multiple_argument_with_statistics_function" + ]["response"] # Act - tf = transformation_function.TransformationFunction.from_response_json(json) + tf = TransformationFunction.from_response_json(json) # Assert - assert tf.id is None + assert tf.id == 1 assert tf._featurestore_id == 11 - assert tf.version is None - assert tf.name is None - 
assert tf.transformation_fn is None - assert tf.output_type == "STRING" - assert tf.source_code_content is None - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "test_func" + assert tf.hopsworks_udf.output_types == ["string"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == [ + "feature1", + "feature2", + "feature3", + ] + assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(str)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n" + ) + + def test_from_response_json_multiple_return_type_functions(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"][ + "get_multiple_return_type_functions" + ]["response"] + + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "test_func" + assert tf.hopsworks_udf.output_types == ["string", "double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == [ + "feature1", + "feature2", + "feature3", + ] + assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n" + ) + + def test_from_response_json_list_empty(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"]["get_list_empty"]["response"] + + # Act + tf_list = TransformationFunction.from_response_json(json) + + # Assert + assert len(tf_list) == 0 def test_from_response_json_list(self, backend_fixtures): # Arrange json = backend_fixtures["transformation_function"]["get_list"]["response"] # Act - tf_list = transformation_function.TransformationFunction.from_response_json( - json - ) + tf_list = TransformationFunction.from_response_json(json) # Assert - assert len(tf_list) == 1 + assert len(tf_list) == 2 tf = tf_list[0] - assert tf.id == 43 + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + ) + + tf = tf_list[1] + assert tf.id == 2 assert tf._featurestore_id == 11 assert tf.version == 1 - assert tf.name == "test_name" - assert tf.transformation_fn is None - assert tf.output_type == "FLOAT" + assert tf.hopsworks_udf.function_name == "add_one_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert not tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["col1"] + assert tf.hopsworks_udf.statistics_features == [] assert ( - tf.source_code_content - == '{"module_imports": "", "transformer_code": "test_builtin_source_code"}' + tf.hopsworks_udf._function_source + == 
"\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None - def test_from_response_json_list_empty(self, backend_fixtures): + +""" + def test_from_response_json_basic_info(self, mocker, backend_fixtures): # Arrange - json = backend_fixtures["transformation_function"]["get_list_empty"]["response"] + json = backend_fixtures["transformation_function"]["get_basic_info"]["response"] # Act - tf_list = transformation_function.TransformationFunction.from_response_json( - json - ) + tf = TransformationFunction.from_response_json(json) # Assert - assert len(tf_list) == 0 + assert tf.id is None + assert tf._featurestore_id == 11 + assert tf.version is None + assert tf.hopsworks_udf is None +""" From a66f9e3d32c0e65cf597dd367c4f2ec1a7f4f59b Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 3 May 2024 16:06:05 +0200 Subject: [PATCH 10/58] clearning transformations engine and adding unit tests --- .../core/transformation_function_engine.py | 181 +- python/hsfs/transformation_function.py | 10 +- .../test_transformation_function_engine.py | 1461 ++--------------- python/tests/test_transformation_function.py | 38 +- 4 files changed, 257 insertions(+), 1433 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index e99b79672a..0ad86f0c53 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -15,12 +15,19 @@ # from __future__ import annotations -import datetime -from functools import partial -from typing import Dict, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union -import hsfs -import numpy +from hsfs import training_dataset +from hsfs.core import statistics_api, transformation_function_api + + +if TYPE_CHECKING: + import pandas as pd + import polars as pl + import pyspark.sql as ps + from hsfs.feature_view import FeatureView + from hsfs.statistics import Statistics + from hsfs.transformation_function import TransformationFunction from hsfs import ( feature_view, @@ -53,27 +60,58 @@ class TransformationFunctionEngine: def __init__(self, feature_store_id: int): self._feature_store_id = feature_store_id - self._transformation_function_api = ( - transformation_function_api.TransformationFunctionApi(feature_store_id) + self._transformation_function_api: transformation_function_api.TransformationFunctionApi = transformation_function_api.TransformationFunctionApi( + feature_store_id ) - self._statistics_api = statistics_api.StatisticsApi( - feature_store_id, training_dataset.TrainingDataset.ENTITY_TYPE + self._statistics_api: statistics_api.StatisticsApi = ( + statistics_api.StatisticsApi( + feature_store_id, training_dataset.TrainingDataset.ENTITY_TYPE + ) ) self._feature_view_api: Optional["feature_view_api.FeatureViewApi"] = None self._statistics_engine: Optional["statistics_engine.StatisticsEngine"] = None - def save(self, transformation_fn_instance: TransformationFunction): + def save( + self, transformation_fn_instance: TransformationFunction + ) -> TransformationFunction: + """ + Save a transformation function into the feature store. + + # Argument + transformation_fn_instance `TransformationFunction`: The transformation function to be saved into the feature store. 
+ """
 self._transformation_function_api.register_transformation_fn(
 transformation_fn_instance
 )
 
- def get_transformation_fn(self, name, version=None):
+ def get_transformation_fn(
+ self, name: str, version: Optional[int] = None
+ ) -> Union[TransformationFunction, List[TransformationFunction]]:
+ """
+ Retrieve a transformation function from the feature store.
+
+ If only the name of the transformation function is provided then all the versions of the transformation function are returned as a list.
+ If neither name nor version is provided then all transformation functions saved in the feature store are returned.
+
+ # Arguments
+ name `str`: The name of the transformation function to be retrieved.
+ version `Optional[int]`: The version of the transformation function to be retrieved.
+ # Returns
+ `Union[TransformationFunction, List[TransformationFunction]]`: A transformation function if both name and version are provided; a list of transformation functions if only the name is provided.
+ """
+
 transformation_fn_instances = (
 self._transformation_function_api.get_transformation_fn(name, version)
 )
- return transformation_fn_instances[0]
+ return transformation_fn_instances
+
+ def get_transformation_fns(self) -> List[TransformationFunction]:
+ """
+ Get all the transformation functions in the feature store.
 
- def get_transformation_fns(self):
+ # Returns
+ `List[TransformationFunction]`: A list of transformation functions.
+ """
 transformation_fn_instances = (
 self._transformation_function_api.get_transformation_fn(
 name=None, version=None
@@ -86,89 +124,71 @@ def get_transformation_fns(self):
 transformation_fns.append(transformation_fn_instance)
 return transformation_fns
 
- def delete(self, transformation_function_instance):
- self._transformation_function_api.delete(transformation_function_instance)
-
- def get_td_transformation_fn(self, training_dataset):
- attached_transformation_fns = (
- self._transformation_function_api.get_td_transformation_fn(training_dataset)
- )
- transformation_fn_dict = {}
- for attached_transformation_fn in attached_transformation_fns:
- transformation_fn_dict[attached_transformation_fn.name] = (
- attached_transformation_fn.transformation_function
- )
- return transformation_fn_dict
-
- @staticmethod
- def infer_spark_type(output_type):
- # TODO : Move to hopsworks_udf
- if not output_type:
- return "STRING" # STRING is default type for spark udfs
-
- if isinstance(output_type, str):
- if output_type.endswith("Type()"):
- return util.translate_legacy_spark_type(output_type)
- output_type = output_type.lower()
+ def delete(self, transformation_function_instance: TransformationFunction) -> None:
+ """
+ Delete a transformation function from the feature store.
- if output_type in (str, "str", "string"):
- return "STRING"
- elif output_type in (bytes, "binary"):
- return "BINARY"
- elif output_type in (numpy.int8, "int8", "byte", "tinyint"):
- return "BYTE"
- elif output_type in (numpy.int16, "int16", "short", "smallint"):
- return "SHORT"
- elif output_type in (int, "int", "integer", numpy.int32):
- return "INT"
- elif output_type in (numpy.int64, "int64", "long", "bigint"):
- return "LONG"
- elif output_type in (float, "float"):
- return "FLOAT"
- elif output_type in (numpy.float64, "float64", "double"):
- return "DOUBLE"
- elif output_type in (
- datetime.datetime,
- numpy.datetime64,
- "datetime",
- "timestamp",
- ):
- return "TIMESTAMP"
- elif output_type in (datetime.date, "date"):
- return "DATE"
- elif output_type in (bool, "boolean", "bool"):
- return "BOOLEAN"
- else:
- raise TypeError("Not supported type %s." % output_type)
-
- # TODO : about statistics computation and fetching.
+ # Arguments
+ transformation_function_instance `TransformationFunction`: The transformation function to be removed from the feature store.
+ """
+ self._transformation_function_api.delete(transformation_function_instance)
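A rough sketch of the retrieval and deletion paths described by the docstrings above, assuming a feature store id of `99` and a previously saved function named `plus_one` (both placeholders):

```python
from hsfs.core import transformation_function_engine

tf_engine = transformation_function_engine.TransformationFunctionEngine(99)

# All versions of "plus_one" as a list, or one function when a version is given.
all_plus_one = tf_engine.get_transformation_fn(name="plus_one")
plus_one_v1 = tf_engine.get_transformation_fn(name="plus_one", version=1)

# Every transformation function registered in the feature store.
all_fns = tf_engine.get_transformation_fns()

# Remove a transformation function from the feature store.
tf_engine.delete(plus_one_v1)
```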
 
- # TODO : Think about what to do with label encoder features.
 @staticmethod
 def compute_transformation_fn_statistics(
- training_dataset_obj,
- builtin_tffn_features,
- label_encoder_features,
- feature_dataframe,
- feature_view_obj,
- ) -> statistics.Statistics:
+ training_dataset_obj: training_dataset.TrainingDataset,
+ statistics_features: List[str],
+ label_encoder_features: List[str],
+ feature_dataframe: Union[pd.DataFrame, pl.DataFrame, ps.DataFrame],
+ feature_view_obj: FeatureView,
+ ) -> Statistics:
+ """
+ Compute the statistics required for a training dataset object.
+
+ # Arguments
+ training_dataset_obj `TrainingDataset`: The training dataset for which the statistics are to be computed.
+ statistics_features `List[str]`: The list of features for which the statistics should be computed.
+ label_encoder_features `List[str]`: Features used for label encoding.
+ feature_dataframe `Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]`: The dataframe that contains the data for which the statistics must be computed.
+ feature_view_obj `FeatureView`: The feature view in which the training data is being created.
+ # Returns
+ `Statistics`: The statistics object that contains the statistics for each feature.
+ """
 return training_dataset_obj._statistics_engine.compute_transformation_fn_statistics(
 td_metadata_instance=training_dataset_obj,
- columns=builtin_tffn_features, # excluding label encoded features
+ columns=statistics_features,
 label_encoder_features=label_encoder_features, # label encoded features only
 feature_dataframe=feature_dataframe,
 feature_view_obj=feature_view_obj,
 )
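For orientation, a sketch of how this static helper is meant to be invoked; the training dataset `td`, feature view `fv`, and feature names are placeholders, not objects defined in this patch:

```python
import pandas as pd

from hsfs.core.transformation_function_engine import TransformationFunctionEngine

df = pd.DataFrame({"feature1": [1, 2], "feature2": [3.0, 4.0]})  # toy data

# Statistics are computed once on the training data and later injected into
# each UDF by compute_and_set_feature_statistics (defined below).
stats = TransformationFunctionEngine.compute_transformation_fn_statistics(
    training_dataset_obj=td,
    statistics_features=["feature1", "feature2"],
    label_encoder_features=[],
    feature_dataframe=df,
    feature_view_obj=fv,
)
```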
 
 @staticmethod
- def add_feature_statistics(training_dataset, feature_view_obj, dataset):
- # TODO : Optimize this code portion check which i better computing all transformation feature statistics together or one by one.
- statistics_features = set()
+ def compute_and_set_feature_statistics(
+ training_dataset: training_dataset.TrainingDataset,
+ feature_view_obj: FeatureView,
+ dataset: Union[
+ Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]],
+ Union[pd.DataFrame, pl.DataFrame, ps.DataFrame],
+ ],
+ ) -> None:
+ """
+ Function that computes and sets the statistics required by the UDFs used for transformations.
+
+ The function assigns the computed statistics to the Hopsworks UDF objects so that the statistics can be used when the UDFs are executed.
+
+ # Arguments
+ training_dataset `TrainingDataset`: The training dataset for which the statistics are to be computed.
+ feature_view_obj `FeatureView`: The feature view in which the training data is being created.
+ dataset `Union[Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]], Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]]`: A dataframe that contains the training data, or a dictionary that contains both the training and test data.
+ """
+ statistics_features: Set[str] = set()
+
+ # Finding the features for which statistics are required
 for transformation_function in feature_view_obj.transformation_functions:
 statistics_features.update(
 transformation_function.hopsworks_udf.statistics_features
 )
 
+ # compute statistics on training data
 if training_dataset.splits:
 # compute statistics before transformations are applied
 stats = TransformationFunctionEngine.compute_transformation_fn_statistics(
@@ -179,7 +199,6 @@ def add_feature_statistics(training_dataset, feature_view_obj, dataset):
 feature_view_obj,
 )
 else:
- # compute statistics before transformations are applied
 stats = TransformationFunctionEngine.compute_transformation_fn_statistics(
 training_dataset,
 list(statistics_features),
@@ -187,6 +206,8 @@ def add_feature_statistics(training_dataset, feature_view_obj, dataset):
 dataset,
 feature_view_obj,
 )
+
+ # Set the computed statistics in the Hopsworks UDFs
 for transformation_function in feature_view_obj.transformation_functions:
 transformation_function.hopsworks_udf.transformation_statistics = (
 stats.feature_descriptive_statistics
 )
diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py
index 1ba52dea4e..0b209bf5c4 100644
--- a/python/hsfs/transformation_function.py
+++ b/python/hsfs/transformation_function.py
@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import json
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import humps
 from hsfs import util
@@ -66,7 +66,7 @@ def __init__(
 self._hopsworks_udf: HopsworksUdf = hopsworks_udf
 
 def save(self) -> None:
- """Persist transformation function in backend.
+ """Save a transformation function into the backend.
 
 !!! example
 ```python
@@ -123,7 +123,7 @@ def __call__(self, *features: List[str]) -> TransformationFunction:
 Update the features to be used in the transformation function.
 
 # Arguments
- features: Names of the features to be passed to the user defined function.
+ features: `List[str]`. Names of the features to be passed to the user defined function.
 # Returns
 `TransformationFunction`: The transformation function with the updated features.
 # Raises
@@ -133,7 +133,9 @@ def __call__(self, *features: List[str]) -> TransformationFunction:
 return self
 
 @classmethod
- def from_response_json(cls, json_dict: Dict[str, Any]) -> TransformationFunction:
+ def from_response_json(
+ cls, json_dict: Dict[str, Any]
+ ) -> Union[TransformationFunction, List[TransformationFunction]]:
 """
 Function that deserializes json obtained from the Java backend.
 
diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py
index fcbb85ab21..ff3c4f4f85 100644
--- a/python/tests/core/test_transformation_function_engine.py
+++ b/python/tests/core/test_transformation_function_engine.py
@@ -14,10 +14,7 @@
 # limitations under the License.
# -import datetime - -import numpy -import pytest +import pandas as pd from hsfs import ( engine, feature, @@ -25,11 +22,9 @@ feature_view, training_dataset, transformation_function, - transformation_function_attached, ) -from hsfs.client.exceptions import FeatureStoreException -from hsfs.constructor.query import Query from hsfs.core import transformation_function_engine +from hsfs.hopsworks_udf import hopsworks_udf fg1 = feature_group.FeatureGroup( @@ -88,9 +83,6 @@ def test_save(self, mocker): # Arrange feature_store_id = 99 - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) mock_tf_api = mocker.patch( "hsfs.core.transformation_function_api.TransformationFunctionApi" ) @@ -99,61 +91,25 @@ def test_save(self, mocker): feature_store_id ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.save(transformation_fn_instance=tf) - - # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 0 - assert ( - str(e_info.value) - == "Transformation function name 'tf_name' with version 1 is reserved for built-in " - "hsfs functions. Please use other name or version" - ) - - def test_save_is_builtin(self, mocker): - # Arrange - feature_store_id = 99 - - mock_tf_engine_is_builtin = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) + @hopsworks_udf(int) + def testFunction(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" + feature_store_id, + hopsworks_udf=testFunction, ) - mock_tf_engine_is_builtin.return_value = False - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.save(transformation_fn_instance=tf) + tf_engine.save(transformation_fn_instance=tf) # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 0 - assert str(e_info.value) == "transformer must be callable" + assert mock_tf_api.return_value.register_transformation_fn.call_count == 1 - def test_save_is_builtin_callable(self, mocker): + def test_get_transformation_fn(self, mocker): # Arrange feature_store_id = 99 - mocker.patch( - "hsfs.transformation_function.TransformationFunction._extract_source_code" - ) - mock_tf_engine_is_builtin = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) mock_tf_api = mocker.patch( "hsfs.core.transformation_function_api.TransformationFunctionApi" ) @@ -162,43 +118,25 @@ def test_save_is_builtin_callable(self, mocker): feature_store_id ) - def testFunction(): - print("Test") + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, ) - mock_tf_engine_is_builtin.return_value = False - - # Act - tf_engine.save(transformation_fn_instance=tf) - - # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 1 - - def test_get_transformation_fn(self, mocker): - # Arrange - 
feature_store_id = 99 - - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) + @hopsworks_udf(float) + def testFunction2(data2, statistics_data2): + return data2 + 1 - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id + tf2 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction2, ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) - tf1 = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf1_name" - ) - transformations = [tf, tf1] + transformations = [tf1, tf2] mock_tf_api.return_value.get_transformation_fn.return_value = transformations @@ -207,7 +145,7 @@ def test_get_transformation_fn(self, mocker): # Assert assert mock_tf_api.return_value.get_transformation_fn.call_count == 1 - assert result == tf + assert result == transformations def test_get_transformation_fns(self, mocker): # Arrange @@ -221,13 +159,25 @@ def test_get_transformation_fns(self, mocker): feature_store_id ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 + tf1 = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf1_name" + feature_store_id, + hopsworks_udf=testFunction1, + ) + + @hopsworks_udf(float) + def testFunction2(data2, statistics_data2): + return data2 + 1 + + tf2 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction2, ) - transformations = [tf, tf1] + + transformations = [tf1, tf2] mock_tf_api.return_value.get_transformation_fn.return_value = transformations @@ -250,1332 +200,169 @@ def test_delete(self, mocker): feature_store_id ) - # Act - tf_engine.delete(transformation_function_instance=None) - - # Assert - assert mock_tf_api.return_value.delete.call_count == 1 - - def test_get_td_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def plus_one(a): - return a + 1 - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=plus_one - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=plus_one - ) - - transformations_attached = [tf_attached, tf1_attached] + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - mock_tf_api.return_value.get_td_transformation_fn.return_value = ( - transformations_attached + tf1 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction1, ) # Act - result = tf_engine.get_td_transformation_fn(training_dataset=None) + tf_engine.delete(transformation_function_instance=tf1) # Assert - assert "tf_name" in result - assert "tf1_name" in result - assert mock_tf_api.return_value.get_td_transformation_fn.call_count == 1 + assert mock_tf_api.return_value.delete.call_count == 1 - def test_attach_transformation_fn_td(self, mocker): + def test_compute_transformation_fn_statistics(self, mocker): # Arrange feature_store_id = 99 
mocker.patch("hsfs.client.get_instance") - mocker.patch("hsfs.constructor.fs_query.FsQuery") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - td = training_dataset.TrainingDataset( name="test", version=1, data_format="CSV", - featurestore_id=feature_store_id, + featurestore_id=99, splits={}, id=10, - transformation_functions=transformation_fn_dict, ) # Act - with pytest.raises(AttributeError) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=td, feature_view_obj=None - ) + tf_engine.compute_transformation_fn_statistics( + training_dataset_obj=td, + statistics_features=None, + label_encoder_features=None, + feature_dataframe=None, + feature_view_obj=None, + ) # Assert - assert str(e_info.value) == "'TrainingDataset' object has no attribute 'labels'" + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 1 + ) - def test_attach_transformation_fn_fv(self, mocker): - # Arrange + def test_compute_and_set_feature_statistics_no_split(self, mocker): feature_store_id = 99 - mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - - fv = feature_view.FeatureView( - name="test", - query=query, + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], - ) - - # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert len(fv._features) == 2 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "tf1_name" - - def test_attach_transformation_fn_fv_self_join(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["fg1_tf_name"] = tf - - fv = feature_view.FeatureView( + td = training_dataset.TrainingDataset( name="test", - query=query_self_join, + version=1, + data_format="CSV", featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], + splits={}, 
+ id=10, ) # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert len(fv._features) == 2 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "fg1_tf_name" - - def test_attach_transformation_fn_fv_q_prefix(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["second_tf1_name"] = tf - transformation_fn_dict["third_tf_name"] = tf - transformation_fn_dict["third_tf1_name"] = tf - fv = feature_view.FeatureView( name="test", - query=query_prefix, - featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], ) + dataset = pd.DataFrame() + # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv + tf_engine.compute_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, dataset=dataset ) # Assert - assert len(fv._features) == 4 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "second_tf1_name" - assert fv._features[2].name == "third_tf_name" - assert fv._features[3].name == "third_tf1_name" + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 1 + ) - def test_attach_transformation_fn_fv_q_prefix_fail(self, mocker): - # Arrange + def test_compute_and_set_feature_statistics_train_test_split(self, mocker): feature_store_id = 99 - mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") - - query_no_prefix = ( - fg1.select_all() - .join(fg2.select(["tf1_name"]), on=["id"]) - .join(fg3.select(["tf_name", "tf1_name"]), on=["id"]) - ) + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, ) - transformation_fn_dict = dict() - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - - fv = feature_view.FeatureView( - name="test", - query=query_no_prefix, + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], - ) - - # Act - with pytest.raises(FeatureStoreException) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert str(e_info.value) == Query.ERROR_MESSAGE_FEATURE_AMBIGUOUS.format( - "tf_name" - ) - - def test_attach_transformation_fn_fv_labels(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = 
transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - - fv = feature_view.FeatureView( + td = training_dataset.TrainingDataset( name="test", - query=query, + version=1, + data_format="CSV", featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=["tf_name"], - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert ( - str(e_info.value) - == "Online transformations for training dataset labels are not supported." + splits={"train": 0.8, "test": 0.2}, + id=10, ) - def test_is_builtin(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id + fv = feature_view.FeatureView( + name="test", + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], ) - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="tf_name", - version=1, - ) + dataset = pd.DataFrame() # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is False - - def test_is_builtin_min_max_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="min_max_scaler", - version=1, + tf_engine.compute_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, dataset=dataset ) - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - # Assert - assert result is True - - def test_is_builtin_min_max_scaler_version(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="min_max_scaler", - version=2, + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 1 ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is False - - def test_is_builtin_standard_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="standard_scaler", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_is_builtin_robust_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="robust_scaler", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert 
- assert result is True - - def test_is_builtin_label_encoder(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="label_encoder", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_populate_builtin_fn_arguments(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def tf_name(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=tf_name, output_type="str" - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert str(e_info.value) == "Not implemented" - - def test_populate_builtin_fn_arguments_min_max_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.min_max_scaler_stats", - return_value=(1, 100), - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def min_max_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=min_max_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["min_value"] == 1 - assert tf.transformation_fn.keywords["max_value"] == 100 - - def test_populate_builtin_fn_arguments_standard_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.standard_scaler_stats", - return_value=(1, 100), - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def standard_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=standard_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["mean"] == 1 - assert tf.transformation_fn.keywords["std_dev"] == 100 - - def test_populate_builtin_fn_arguments_robust_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.robust_scaler_stats", - return_value={24: 1, 49: 2, 74: 3}, - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def robust_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=robust_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["p25"] == 1 - assert tf.transformation_fn.keywords["p50"] == 2 - assert tf.transformation_fn.keywords["p75"] == 3 - 
- def test_populate_builtin_fn_arguments_label_encoder(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.encoder_stats", - return_value="test", - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def label_encoder(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=label_encoder, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["value_to_index"] == "test" - - def test_populate_builtin_attached_fns(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin", - return_value=False, - ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_fn_arguments" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=testFunction - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf_attached - transformation_fn_dict["tf1_name"] = tf1_attached - - # Act - tf_engine.populate_builtin_attached_fns( - attached_transformation_fns=transformation_fn_dict, - feature_descriptive_stats=None, - ) - - # Assert - assert transformation_fn_dict["tf_name"] == tf_attached - assert transformation_fn_dict["tf1_name"] == tf1_attached - - def test_populate_builtin_attached_fns_is_builtin(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_fn_arguments" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=testFunction - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf_attached - transformation_fn_dict["tf1_name"] = tf1_attached - - # Act - tf_engine.populate_builtin_attached_fns( - attached_transformation_fns=transformation_fn_dict, - feature_descriptive_stats=None, - ) - - # Assert - assert transformation_fn_dict["tf_name"] != tf_attached - assert transformation_fn_dict["tf1_name"] != tf1_attached - - def test_infer_spark_type_string_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(str) - - # Assert - assert result == "STRING" - - def test_infer_spark_type_string_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( 
- feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("str") - - # Assert - assert result == "STRING" - - def test_infer_spark_type_string_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("string") - - # Assert - assert result == "STRING" - - def test_infer_spark_type_byte_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(bytes) - result1 = tf_engine.infer_spark_type("BinaryType()") - - # Assert - assert result == "BINARY" - assert result1 == "BINARY" - - def test_infer_spark_type_int8_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int8) - - # Assert - assert result == "BYTE" - - def test_infer_spark_type_int8_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int8") - - # Assert - assert result == "BYTE" - - def test_infer_spark_type_int8_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("byte") - result1 = tf_engine.infer_spark_type("ByteType()") - - # Assert - assert result == "BYTE" - assert result1 == "BYTE" - - def test_infer_spark_type_int16_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int16) - - # Assert - assert result == "SHORT" - - def test_infer_spark_type_int16_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int16") - - # Assert - assert result == "SHORT" - - def test_infer_spark_type_int16_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("short") - result1 = tf_engine.infer_spark_type("ShortType()") - - # Assert - assert result == "SHORT" - assert result1 == "SHORT" - - def test_infer_spark_type_int_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(int) - - # Assert - assert result == "INT" - - def test_infer_spark_type_int_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int") - - # Assert - assert result == "INT" - - def test_infer_spark_type_int_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int32) - result1 = tf_engine.infer_spark_type("IntegerType()") - - # Assert - assert result == "INT" - assert result1 == "INT" - - def 
test_infer_spark_type_int64_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int64) - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int64") - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("long") - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_4(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("bigint") - result1 = tf_engine.infer_spark_type("LongType()") - - # Assert - assert result == "LONG" - assert result1 == "LONG" - - def test_infer_spark_type_float_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(float) - - # Assert - assert result == "FLOAT" - - def test_infer_spark_type_float_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("float") - result1 = tf_engine.infer_spark_type("FloatType()") - - # Assert - assert result == "FLOAT" - assert result1 == "FLOAT" - - def test_infer_spark_type_double_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.float64) - - # Assert - assert result == "DOUBLE" - - def test_infer_spark_type_double_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("float64") - - # Assert - assert result == "DOUBLE" - - def test_infer_spark_type_double_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("double") - result1 = tf_engine.infer_spark_type("DoubleType()") - - # Assert - assert result == "DOUBLE" - assert result1 == "DOUBLE" - - def test_infer_spark_type_timestamp_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(datetime.datetime) - - # Assert - assert result == "TIMESTAMP" - - def test_infer_spark_type_timestamp_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.datetime64) - result1 = tf_engine.infer_spark_type("TimestampType()") - - # Assert - assert result == "TIMESTAMP" - assert result1 == "TIMESTAMP" - - def test_infer_spark_type_date_type_1(self): - # Arrange - 
feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(datetime.date) - result1 = tf_engine.infer_spark_type("DateType()") - - # Assert - assert result == "DATE" - assert result1 == "DATE" - - def test_infer_spark_type_bool_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(bool) - - # Assert - assert result == "BOOLEAN" - - def test_infer_spark_type_bool_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("boolean") - - # Assert - assert result == "BOOLEAN" - - def test_infer_spark_type_bool_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("bool") - result1 = tf_engine.infer_spark_type("BooleanType()") - - # Assert - assert result == "BOOLEAN" - assert result1 == "BOOLEAN" - - def test_infer_spark_type_wrong_type(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - with pytest.raises(TypeError) as e_info: - tf_engine.infer_spark_type("wrong") - - # Assert - assert str(e_info.value) == "Not supported type wrong." - - def test_compute_transformation_fn_statistics(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=99, - splits={}, - id=10, - ) - - # Act - tf_engine.compute_transformation_fn_statistics( - training_dataset_obj=td, - builtin_tffn_features=None, - label_encoder_features=None, - feature_dataframe=None, - feature_view_obj=None, - ) - - # Assert - assert ( - mock_s_engine.return_value.compute_transformation_fn_statistics.call_count - == 1 - ) - - def test_populate_builtin_transformation_functions(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_engine_compute_transformation_fn_statistics = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.compute_transformation_fn_statistics" - ) - mock_tf_engine_populate_builtin_attached_fns = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_attached_fns" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - def label_encoder(): - print("Test") - - tf_label_encoder = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=label_encoder, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - 
transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["label_encoder"] = tf_label_encoder - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, - ) - - dataset = mocker.Mock() - - # Act - tf_engine.populate_builtin_transformation_functions( - training_dataset=td, feature_view_obj=None, dataset=dataset - ) - - # Assert - assert mock_tf_engine_compute_transformation_fn_statistics.call_count == 1 - assert mock_tf_engine_populate_builtin_attached_fns.call_count == 1 - assert dataset.get.call_count == 0 - - def test_populate_builtin_transformation_functions_splits(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_engine_compute_transformation_fn_statistics = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.compute_transformation_fn_statistics" - ) - mock_tf_engine_populate_builtin_attached_fns = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_attached_fns" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - def label_encoder(): - print("Test") - - tf_label_encoder = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=label_encoder, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["label_encoder"] = tf_label_encoder - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={"key": "value"}, - id=10, - transformation_functions=transformation_fn_dict, - ) - - dataset = mocker.Mock() - - # Act - tf_engine.populate_builtin_transformation_functions( - training_dataset=td, feature_view_obj=None, dataset=dataset - ) - - # Assert - assert mock_tf_engine_compute_transformation_fn_statistics.call_count == 1 - assert mock_tf_engine_populate_builtin_attached_fns.call_count == 1 - assert dataset.get.call_count == 1 - - # Previously in test_feature_view_engine - def test_get_fv_attached_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - td_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id=feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = tf - - # Act - result = td_engine.get_fv_attached_transformation_fn( - fv_name="fv_name", fv_version=1 - ) - - # Assert - assert "tf_name" in result - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 - - def test_get_fv_attached_transformation_fn_multiple(self, mocker): - # Arrange - feature_store_id = 99 - - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - - td_engine = 
transformation_function_engine.TransformationFunctionEngine(
-            feature_store_id=feature_store_id
-        )
-
-        def testFunction():
-            print("Test")
-
-        tf = transformation_function_attached.TransformationFunctionAttached(
-            name="tf_name", transformation_function=testFunction
-        )
-        tf1 = transformation_function_attached.TransformationFunctionAttached(
-            name="tf1_name", transformation_function=testFunction
-        )
-
-        mock_fv_api.return_value.get_attached_transformation_fn.return_value = [tf, tf1]
-
-        # Act
-        result = td_engine.get_fv_attached_transformation_fn(
-            fv_name="fv_name", fv_version=1
-        )
-
-        # Assert
-        assert "tf_name" in result
-        assert "tf1_name" in result
-        assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1
diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py
index 0d1f29f346..5fdea2987f 100644
--- a/python/tests/test_transformation_function.py
+++ b/python/tests/test_transformation_function.py
@@ -15,6 +15,9 @@
 #

+import pytest
+from hsfs.client.exceptions import FeatureStoreException
+from hsfs.hopsworks_udf import hopsworks_udf
 from hsfs.transformation_function import TransformationFunction

@@ -168,18 +171,29 @@ def test_from_response_json_list(self, backend_fixtures):
             == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n"
         )

+    def test_transformation_function_definition_no_hopworks_udf(self):
+        def test(col1):
+            return col1 + 1

-"""
-    def test_from_response_json_basic_info(self, mocker, backend_fixtures):
-        # Arrange
-        json = backend_fixtures["transformation_function"]["get_basic_info"]["response"]
+        with pytest.raises(FeatureStoreException) as exception:
+            TransformationFunction(
+                featurestore_id=10,
+                hopsworks_udf=test,
+            )

-        # Act
-        tf = TransformationFunction.from_response_json(json)
+        assert (
+            str(exception.value)
+            == "Please use the hopsworks_udf decorator when defining transformation functions."
+        )

-        # Assert
-        assert tf.id is None
-        assert tf._featurestore_id == 11
-        assert tf.version is None
-        assert tf.hopsworks_udf is None
-"""
+    def test_transformation_function_definition_with_hopworks_udf(self):
+        @hopsworks_udf(int)
+        def test2(col1):
+            return col1 + 1
+
+        tf = TransformationFunction(
+            featurestore_id=10,
+            hopsworks_udf=test2,
+        )
+
+        assert tf.hopsworks_udf == test2

From 853995a5936df14f5585f0a11fe5bb6e127fff39 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Fri, 3 May 2024 17:33:29 +0200
Subject: [PATCH 11/58] feature view api formatted

---
 python/hsfs/core/feature_view_api.py | 52 ++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py
index 6ff621c7db..1bc6b46115 100644
--- a/python/hsfs/core/feature_view_api.py
+++ b/python/hsfs/core/feature_view_api.py
@@ -73,13 +73,28 @@ def update(self, feature_view_obj: feature_view.FeatureView) -> None:
             data=feature_view_obj.json(),
         )

-    def get_by_name(self, name: str) -> feature_view.FeatureView:
+    def get_by_name(self, name: str) -> List[feature_view.FeatureView]:
+        """
+        Get a feature view from the backend using its name.
+
+        # Arguments
+            name `str`: Name of the feature view.
+
+        # Returns
+            `List[FeatureView]`: A list that contains all versions of the feature view.
+
+        # Raises
+            `RestAPIError`: If the feature view cannot be found from the backend.
+            `ValueError`: If the feature group associated with the feature view cannot be found.
+ """ path = self._base_path + [name] try: return [ feature_view.FeatureView.from_response_json(fv) for fv in self._client._send_request( - self._GET, path, {"expand": ["query", "features"]} + self._GET, + path, + {"expand": ["query", "features", "transformationfunctions"]}, )["items"] ] except RestAPIError as e: @@ -93,6 +108,20 @@ def get_by_name(self, name: str) -> feature_view.FeatureView: raise e def get_by_name_version(self, name: str, version: int) -> feature_view.FeatureView: + """ + Get a feature view form the backend using both name and version + + # Arguments + name `str`: Name of feature view. + version `version`: Version of the feature view. + + # Returns + `FeatureView` + + # Raises + `RestAPIError`: If the feature view cannot be found from the backend. + `ValueError`: If the feature group associated with the feature view cannot be found. + """ path = self._base_path + [name, self._VERSION, version] try: return feature_view.FeatureView.from_response_json( @@ -179,10 +208,21 @@ def get_serving_prepared_statement( def get_attached_transformation_fn( self, name: str, version: int - ) -> Union[ - "transformation_function.TransformationFunction", - List["transformation_function.TransformationFunction"], - ]: + ) -> List["transformation_function.TransformationFunction"]: + """ + Get transformation functions attached to a feature view form the backend + + # Arguments + name `str`: Name of feature view. + version `ìnt`: Version of feature view. + + # Returns + `List[TransformationFunction]` : List of transformation functions attached to the feature view. + + # Raises + `RestAPIError`: If the feature view cannot be found from the backend. + `ValueError`: If the feature group associated with the feature view cannot be found. + """ path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION] return transformation_function.TransformationFunction.from_response_json( self._client._send_request("GET", path) From b4a37afe2201a7f0959835507450d92d38af8c23 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sat, 4 May 2024 13:40:04 +0200 Subject: [PATCH 12/58] reformatting and fixing feature_view_engine --- python/hsfs/core/feature_view_engine.py | 79 ++++++++++++--- python/tests/core/test_feature_view_engine.py | 96 ++++++++++++------- 2 files changed, 126 insertions(+), 49 deletions(-) diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index e954701d8e..3305c0e209 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -17,7 +17,7 @@ import datetime import warnings -from typing import Optional +from typing import TYPE_CHECKING, List, Optional, Union from hsfs import ( client, @@ -37,11 +37,15 @@ statistics_engine, tags_api, training_dataset_engine, - transformation_function_engine, ) from hsfs.training_dataset_split import TrainingDatasetSplit +if TYPE_CHECKING: + from hsfs.feature_view import FeatureView + from hsfs.transformation_function import TransformationFunction + + class FeatureViewEngine: ENTITY_TYPE = "featureview" _TRAINING_DATA_API_PATH = "trainingdatasets" @@ -53,11 +57,6 @@ def __init__(self, feature_store_id): self._feature_view_api = feature_view_api.FeatureViewApi(feature_store_id) self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - ) self._td_code_engine = code_engine.CodeEngine( feature_store_id, self._TRAINING_DATA_API_PATH ) 
@@ -69,7 +68,16 @@ def __init__(self, feature_store_id):
         )
         self._query_constructor_api = query_constructor_api.QueryConstructorApi()

-    def save(self, feature_view_obj):
+    def save(self, feature_view_obj: FeatureView) -> FeatureView:
+        """
+        Save a feature view to the backend.
+
+        # Arguments
+            feature_view_obj `FeatureView` : The feature view object to be saved.
+
+        # Returns
+            `FeatureView` : Updated feature view that has the ID used to save in the backend.
+        """
         if feature_view_obj.query.is_time_travel():
             warnings.warn(
                 "`as_of` argument in the `Query` will be ignored because"
@@ -120,8 +128,6 @@ def save(self, feature_view_obj):
                 )
             )

-        # TODO : Remove this code portion attaches a transfromation function to a feature. This is not possible with the current implementation
-
         updated_fv = self._feature_view_api.post(feature_view_obj)
         print(
             "Feature view created successfully, explore it at \n"
@@ -129,11 +135,38 @@ def save(self, feature_view_obj):
         )
         return updated_fv

-    def update(self, feature_view_obj):
+    def update(self, feature_view_obj: FeatureView) -> FeatureView:
+        """
+        Update the feature view object saved in the backend.
+
+        # Arguments
+            feature_view_obj `FeatureView` : The feature view object to be updated.
+
+        # Returns
+            `FeatureView` : Updated feature view that has the ID used to save in the backend.
+        """
         self._feature_view_api.update(feature_view_obj)
         return feature_view_obj

-    def get(self, name, version=None):
+    def get(
+        self, name: str, version: int = None
+    ) -> Union[FeatureView, List[FeatureView]]:
+        """
+        Get a feature view from the backend using name or using name and version.
+
+        If version is not provided then a list of feature views containing all of its versions is returned.
+
+        # Arguments
+            name `str`: Name of feature view.
+            version `int`: Version of the feature view.
+
+        # Returns
+            `Union[FeatureView, List[FeatureView]]`
+
+        # Raises
+            `RestAPIError`: If the feature view cannot be found from the backend.
+            `ValueError`: If the feature group associated with the feature view cannot be found.
+        """
         if version:
             fv = self._feature_view_api.get_by_name_version(name, version)
         else:
             fv = self._feature_view_api.get_by_name(name)
@@ -232,6 +265,28 @@ def get_batch_query_string(
             return fs_query.pit_query
         return fs_query.query

+    def get_attached_transformation_fn(
+        self, name: str, version: int
+    ) -> List[TransformationFunction]:
+        """
+        Get transformation functions attached to a feature view from the backend.
+
+        # Arguments
+            name `str`: Name of feature view.
+            version `int`: Version of feature view.
+
+        # Returns
+            `List[TransformationFunction]` : List of transformation functions attached to the feature view.
+
+        # Raises
+            `RestAPIError`: If the feature view cannot be found from the backend.
+            `ValueError`: If the feature group associated with the feature view cannot be found.
+ """ + transformation_functions = ( + self._feature_view_api.get_attached_transformation_fn(name, version) + ) + return transformation_functions + def create_training_dataset( self, feature_view_obj, diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index d8410aa21e..e50868285d 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -23,14 +23,15 @@ feature_view, split_statistics, training_dataset, - transformation_function_attached, ) from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import fs_query from hsfs.constructor.query import Query from hsfs.core import arrow_flight_client, feature_view_engine from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.hopsworks_udf import hopsworks_udf from hsfs.storage_connector import BigQueryConnector, StorageConnector +from hsfs.transformation_function import TransformationFunction engine.init("python") @@ -95,9 +96,6 @@ def test_save(self, mocker): "hsfs.core.feature_view_engine.FeatureViewEngine._get_feature_view_url", return_value=feature_view_url, ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", - ) mock_print = mocker.patch("builtins.print") fv_engine = feature_view_engine.FeatureViewEngine( @@ -113,7 +111,6 @@ def test_save(self, mocker): # Assert assert mock_fv_api.return_value.post.call_count == 1 - assert mock_attach_transformation.call_count == 1 assert mock_print.call_count == 1 assert mock_print.call_args[0][ 0 @@ -353,10 +350,7 @@ def test_get_name(self, mocker): mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn" - ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", + "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" ) fv_engine = feature_view_engine.FeatureViewEngine( @@ -385,7 +379,6 @@ def test_get_name(self, mocker): # Assert assert mock_fv_api.return_value.get_by_name_version.call_count == 0 - assert mock_attach_transformation.call_count == 2 assert mock_fv_api.return_value.get_by_name.call_count == 1 assert len(result) == 2 @@ -395,10 +388,7 @@ def test_get_name_version(self, mocker): mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn" - ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", + "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" ) fv_engine = feature_view_engine.FeatureViewEngine( @@ -420,7 +410,6 @@ def test_get_name_version(self, mocker): # Assert assert mock_fv_api.return_value.get_by_name_version.call_count == 1 - assert mock_attach_transformation.call_count == 1 assert mock_fv_api.return_value.get_by_name.call_count == 0 def test_delete_name(self, mocker): @@ -566,40 +555,73 @@ def test_get_batch_query_string_pit_query(self, mocker): assert mock_fv_api.return_value.get_batch_query.call_count == 1 assert mock_qc_api.return_value.construct_query.call_count == 1 - def test_attach_transformation_function(self, mocker): - def testFunction(): - print("Test") + def 
test_get_attached_transformation_fn(self, mocker): + # Arrange + feature_store_id = 99 - tf = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction + mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") + + fv_engine = feature_view_engine.FeatureViewEngine( + feature_store_id=feature_store_id ) - mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn", - return_value={"label": tf}, + + @hopsworks_udf(int) + def test2(col1): + return col1 + 1 + + tf = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test2, ) + + mock_fv_api.return_value.get_attached_transformation_fn.return_value = [tf] + + # Act + result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) + + # Assert + assert result == [tf] + assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 + + def test_get_attached_transformation_fn_multiple(self, mocker): + # Arrange feature_store_id = 99 + + mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") + fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id ) - fv = feature_view.FeatureView( - name="fv_name", - version=1, - query=query, - featurestore_id=feature_store_id, + + @hopsworks_udf(int) + def test1(col1): + return col1 + 1 + + tf1 = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test1, + ) + + @hopsworks_udf(int) + def test2(col1): + return col1 + 2 + + tf2 = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test2, ) - fv.schema = query.features + + mock_fv_api.return_value.get_attached_transformation_fn.return_value = [ + tf1, + tf2, + ] # Act - fv_engine.attach_transformation_function(fv) + result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) # Assert - id_feature = fv.schema[0] - label_feature = fv.schema[1] - assert id_feature.name == "id" - assert id_feature.transformation_function is None - assert label_feature.name == "label" - assert label_feature.transformation_function == tf + assert result == [tf1, tf2] + assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 def test_create_training_dataset(self, mocker): # Arrange From 2a6250074f8befa8f686f9f17b49607215c8411a Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sat, 4 May 2024 14:43:49 +0200 Subject: [PATCH 13/58] reformatted and added unit tests for feature view --- python/hsfs/feature_view.py | 61 ++++++++++----- .../tests/fixtures/feature_view_fixtures.json | 4 +- .../transformation_function_fixtures.json | 7 +- python/tests/test_feature_view.py | 75 +++++++++++++------ 4 files changed, 95 insertions(+), 52 deletions(-) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 386e3b256f..837bc168c2 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -123,19 +123,14 @@ def __init__( training_helper_columns if training_helper_columns else [] ) - # TODO : Clean this up - if transformation_functions: - for i, transformation_function in enumerate(transformation_functions): - if not isinstance(transformation_function, TransformationFunction): - transformation_functions[i] = TransformationFunction( - self.featurestore_id, - hopsworks_udf=transformation_function, - version=1, - ) - - self._transformation_functions: List[TransformationFunction] = ( - transformation_functions - ) + 
self._transformation_functions: List[TransformationFunction] = [
+            TransformationFunction(
+                self.featurestore_id, hopsworks_udf=transformation_function, version=1
+            )
+            if not isinstance(transformation_function, TransformationFunction)
+            else transformation_function
+            for transformation_function in transformation_functions
+        ]

         self._features = []
         self._feature_view_engine: feature_view_engine.FeatureViewEngine = (
@@ -3396,6 +3391,14 @@ def create_feature_monitoring(

     @classmethod
     def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView":
+        """
+        Function that constructs the class object from its json serialization.
+
+        # Arguments
+            json_dict: `Dict[str, Any]`. Json serialized dictionary for the class.
+        # Returns
+            `FeatureView`: Json deserialized class object.
+        """
         json_decamelized = humps.decamelize(json_dict)

         serving_keys = json_decamelized.get("serving_keys", None)
@@ -3403,6 +3406,7 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView":
             serving_keys = [
                 skm.ServingKey.from_response_json(sk) for sk in serving_keys
             ]
+        transformation_functions = json_decamelized.get("transformation_functions", {})
         fv = cls(
             id=json_decamelized.get("id", None),
             name=json_decamelized["name"],
             query=Query.from_response_json(json_decamelized["query"]),
             featurestore_id=json_decamelized["featurestore_id"],
             version=json_decamelized.get("version", None),
             description=json_decamelized.get("description", None),
             featurestore_name=json_decamelized.get("featurestore_name", None),
             serving_keys=serving_keys,
-            transformation_functions=[
-                TransformationFunction.from_response_json(transformation)
-                for transformation in json_decamelized.get(
-                    "transformation_functions", []
-                )
-            ],
+            transformation_functions=TransformationFunction.from_response_json(
+                transformation_functions
+            )
+            if transformation_functions
+            else [],
         )
         features = json_decamelized.get("features", [])
         if features:
@@ -3439,6 +3442,14 @@
         return fv

     def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView":
+        """
+        Function that updates the class object from its json serialization.
+
+        # Arguments
+            json_dict: `Dict[str, Any]`. Json serialized dictionary for the class.
+        # Returns
+            `FeatureView`: Json deserialized class object.
+        """
         other = self.from_response_json(json_dict)
         for key in [
             "name",
@@ -3480,9 +3491,21 @@ def _init_feature_monitoring_engine(self) -> None:
         )

     def json(self) -> str:
+        """
+        Convert class into its json serialized form.
+
+        # Returns
+            `str`: Json serialized object.
+        """
         return json.dumps(self, cls=util.FeatureStoreEncoder)

     def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert class into a dictionary.
+
+        # Returns
+            `Dict`: Dictionary that contains all data required to json serialize the object.
+ """ return { "featurestoreId": self._featurestore_id, "name": self._name, diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index aabf2bf9f6..92601b46da 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -159,9 +159,7 @@ "id": 11, "version": 1, "description": "test_description", - "transformation_functions": { - "featurestore_id": 5 - }, + "transformation_functions": {}, "features": [ { "name": "intt", diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 98017a07c5..5b8e753508 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -51,14 +51,9 @@ } } }, - "get_basic_info": { - "response": { - "featurestore_id": 11 - } - }, "get_list": { "response": { - "count": 1, + "count": 2, "items": [ { "id" : 1, diff --git a/python/tests/test_feature_view.py b/python/tests/test_feature_view.py index 25a1cc6fbe..e8e36c0f1e 100644 --- a/python/tests/test_feature_view.py +++ b/python/tests/test_feature_view.py @@ -15,9 +15,10 @@ # import warnings -from hsfs import feature_view, training_dataset_feature, transformation_function +from hsfs import feature_view, training_dataset_feature from hsfs.constructor import fs_query, query from hsfs.feature_store import FeatureStore +from hsfs.hopsworks_udf import hopsworks_udf class TestFeatureView: @@ -32,7 +33,6 @@ def test_from_response_json(self, mocker, backend_fixtures): mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.feature_store_api.FeatureStoreApi.get") json = backend_fixtures["feature_view"]["get"]["response"] - # Act fv = feature_view.FeatureView.from_response_json(json) @@ -44,7 +44,7 @@ def test_from_response_json(self, mocker, backend_fixtures): assert fv.version == 1 assert fv.description == "test_description" assert fv.labels == ["intt"] - assert fv.transformation_functions == {} + assert fv.transformation_functions == [] assert len(fv.schema) == 2 assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) @@ -65,10 +65,50 @@ def test_from_response_json_basic_info(self, mocker, backend_fixtures): assert fv.version is None assert fv.description is None assert fv.labels == [] - assert fv.transformation_functions == {} + assert fv.transformation_functions == [] assert len(fv.schema) == 0 assert fv.query._left_feature_group.deprecated is False + def test_from_response_json_transformation_function(self, mocker, backend_fixtures): + # Arrange + mocker.patch.object( + FeatureStore, + "project_id", + return_value=99, + ) + mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.engine.get_type") + mocker.patch("hsfs.core.feature_store_api.FeatureStoreApi.get") + json = backend_fixtures["feature_view"]["get_transformations"]["response"] + # Act + fv = feature_view.FeatureView.from_response_json(json) + + # Assert + assert fv.name == "test_name" + assert fv.id == 11 + assert isinstance(fv.query, query.Query) + assert fv.featurestore_id == 5 + assert fv.version == 1 + assert fv.description == "test_description" + assert fv.labels == ["intt"] + assert len(fv.transformation_functions) == 2 + assert ( + fv.transformation_functions[0].hopsworks_udf.function_name == "add_mean_fs" + ) + assert ( + fv.transformation_functions[1].hopsworks_udf.function_name == "add_one_fs" + ) + assert ( + 
fv.transformation_functions[0].hopsworks_udf._function_source
+            == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n"
+        )
+        assert (
+            fv.transformation_functions[1].hopsworks_udf._function_source
+            == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n"
+        )
+        assert len(fv.schema) == 2
+        assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature)

     def test_from_response_json_basic_info_deprecated(self, mocker, backend_fixtures):
         # Arrange
         mocker.patch("hsfs.engine.get_type")
@@ -87,7 +127,7 @@ def test_from_response_json_basic_info_deprecated(self, mocker, backend_fixtures
         assert fv.version is None
         assert fv.description is None
         assert fv.labels == []
-        assert fv.transformation_functions == {}
+        assert fv.transformation_functions == []
         assert len(fv.schema) == 0
         assert fv.query._left_feature_group.deprecated is True
         assert len(warning_record) == 1
@@ -104,31 +144,18 @@ def test_transformation_function_instances(self, mocker, backend_fixtures):
         # Act
         q = fs_query.FsQuery.from_response_json(json)

-        def testFunction():
-            print("Test")
-
-        tf = transformation_function.TransformationFunction(
-            feature_store_id,
-            transformation_fn=testFunction,
-            builtin_source_code="",
-            output_type="str",
-        )
-
-        transformation_fn_dict = dict()
-        transformation_fn_dict["tf_name"] = tf
-        transformation_fn_dict["tf1_name"] = tf
+        @hopsworks_udf(int)
+        def test(col1):
+            return col1 + 1

         fv = feature_view.FeatureView(
             featurestore_id=feature_store_id,
             name="test_fv",
             version=1,
             query=q,
-            transformation_functions=transformation_fn_dict,
+            transformation_functions=[test("data1"), test("data2")],
         )

-        updated_transformation_fn_dict = fv.transformation_functions
+        transformation_functions = fv.transformation_functions

-        assert (
-            updated_transformation_fn_dict["tf_name"]
-            != updated_transformation_fn_dict["tf1_name"]
-        )
+        assert transformation_functions[0] != transformation_functions[1]

From 35d72dc298c1d6816bdec74f87d279f1f852ee0f Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Sat, 4 May 2024 15:09:24 +0200
Subject: [PATCH 14/58] updating documentation for feature store

---
 python/hsfs/feature_store.py | 64 +++++++++++-------------------------
 1 file changed, 20 insertions(+), 44 deletions(-)

diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index 24033bf11b..e2ee0f9cc9 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -22,7 +22,6 @@

 import great_expectations as ge
 import humps
-import numpy
 import numpy as np
 import pandas as pd
 import polars as pl
@@ -1283,35 +1282,20 @@ def create_training_dataset(
     def create_transformation_function(
         self,
         transformation_function: callable,
-        output_type: Union[
-            str,
-            bytes,
-            int,
-            numpy.int8,
-            numpy.int16,
-            numpy.int32,
-            numpy.int64,
-            float,
-            numpy.float64,
-            datetime.datetime,
-            numpy.datetime64,
-            datetime.date,
-            bool,
-        ],
         version: Optional[int] = None,
     ) -> "TransformationFunction":
         """Create a transformation function metadata object.

         !!! example
             ```python
-            # define function
+            # define the transformation function as a Hopsworks UDF
+            @hopsworks_udf(int)
             def plus_one(value):
                 return value + 1

             # create transformation function
             plus_one_meta = fs.create_transformation_function(
                 transformation_function=plus_one,
-                output_type=int,
                 version=1
             )

             # persist transformation function in backend
             plus_one_meta.save()
             ```
@@ -1325,8 +1309,7 @@ def plus_one(value):
             call the `save()` method of the transformation function metadata object.
        # Arguments
-            transformation_function: callable object.
-            output_type: python or numpy output type that will be inferred as pyspark.sql.types type.
+            transformation_function: Hopsworks UDF.

        # Returns:
            `TransformationFunction`: The TransformationFunction metadata object.
@@ -1334,7 +1317,6 @@ def plus_one(value):
         return TransformationFunction(
             featurestore_id=self._id,
             transformation_fn=transformation_function,
-            output_type=output_type,
             version=version,
         )

@@ -1392,9 +1374,7 @@ def get_transformation_function(
                 name='feature_view_name',
                 query=query,
                 labels=["target_column"],
-                transformation_functions={
-                    "column_to_transform": min_max_scaler
-                }
+                transformation_functions=[min_max_scaler("feature1")]
             )
             ```
@@ -1421,12 +1401,12 @@ def get_transformation_function(
                 name='transactions_view',
                 query=query,
                 labels=["fraud_label"],
-                transformation_functions = {
-                    "category_column": label_encoder,
-                    "weight": robust_scaler,
-                    "age": min_max_scaler,
-                    "salary": standard_scaler
-                }
+                transformation_functions = [
+                    label_encoder("category_column"),
+                    robust_scaler("weight"),
+                    min_max_scaler("age"),
+                    standard_scaler("salary")
+                ]
             )
             ```
@@ -1486,11 +1466,13 @@ def create_feature_view(
             # construct the query
             query = fg1.select_all().join(fg2.select_all())

-            # get the transformation functions
-            standard_scaler = fs.get_transformation_function(name='standard_scaler')
+            # define the transformation function as a Hopsworks UDF
+            @hopsworks_udf(int)
+            def plus_one(value):
+                return value + 1

-            # construct dictionary of "feature - transformation function" pairs
-            transformation_functions = {col_name: standard_scaler for col_name in df.columns}
+            # construct list of "transformation functions" on features
+            transformation_functions = {plus_one("feature1"), plus_one("feature2")}

             feature_view = fs.create_feature_view(
                 name='air_quality_fv',
@@ -1508,7 +1490,7 @@ def create_feature_view(
             # define query object
             query = ...

-            # define dictionary with column names and transformation functions pairs
+            # define list of transformation functions
             mapping_transformers = ...

             # create feature view
@@ -1554,10 +1536,7 @@ def create_feature_view(
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[]`, no training helper columns.
-            transformation_functions: A dictionary mapping tansformation functions to
-                to the features they should be applied to before writing out the
-                vector and at inference time. Defaults to `{}`, no
-                transformations.
+            transformation_functions: A list of Hopsworks UDFs. Defaults to `None`, no transformations.

        # Returns:
            `FeatureView`: The feature view metadata object.
@@ -1632,10 +1611,7 @@ def get_or_create_feature_view(
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[]`, no training helper columns.
-            transformation_functions: A dictionary mapping tansformation functions to
-                to the features they should be applied to before writing out the
-                vector and at inference time. Defaults to `{}`, no
-                transformations.
+            transformation_functions: A list of Hopsworks UDFs. Defaults to `None`, no transformations.

        # Returns:
            `FeatureView`: The feature view metadata object.
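[Editor's note] The docstring changes above describe the new decorator-based flow in pieces; the following minimal end-to-end sketch ties them together. It is illustrative only and not part of the patch series: `fs` is assumed to be an existing feature store handle, `query` an existing `Query` object, and the feature name "pm25" is a placeholder.

```python
# Illustrative sketch (not part of the patch), assuming `fs` and `query` exist.
import pandas as pd
from hsfs.hopsworks_udf import hopsworks_udf

# A UDF is declared once with its return type...
@hopsworks_udf(float)
def add_one(data1: pd.Series):
    return data1 + 1

# ...and bound to concrete features by calling it. A list of bound UDFs
# replaces the old {feature_name: transformation_function} dictionary.
feature_view = fs.create_feature_view(
    name="air_quality_fv",
    version=1,
    query=query,
    transformation_functions=[add_one("pm25")],  # "pm25" is a placeholder
)
```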
@@ -1655,7 +1631,7 @@ def get_or_create_feature_view(
                 labels=labels or [],
                 inference_helper_columns=inference_helper_columns or [],
                 training_helper_columns=training_helper_columns or [],
-                transformation_functions=transformation_functions or {},
+                transformation_functions=transformation_functions or [],
             )
         else:
             raise e

From 7ca35fda44cb1b4c9a80943d5cf95421d9338671 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Sat, 4 May 2024 15:11:10 +0200
Subject: [PATCH 15/58] updating documentation for feature store

---
 python/hsfs/feature_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index e2ee0f9cc9..10f6a269bc 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -1472,7 +1472,7 @@ def plus_one(value):
                 return value + 1

             # construct list of "transformation functions" on features
-            transformation_functions = {plus_one("feature1"), plus_one("feature2")}
+            transformation_functions = [plus_one("feature1"), plus_one("feature2")]

             feature_view = fs.create_feature_view(
                 name='air_quality_fv',

From 5e377e6ddcdc74947eadfae6cce876479005f560 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Sun, 5 May 2024 17:02:24 +0200
Subject: [PATCH 16/58] fixed tests for training dataset features

---
 .../training_dataset_feature_fixtures.json    | 21 +------------------
 python/tests/test_training_dataset_feature.py |  8 +------
 2 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/python/tests/fixtures/training_dataset_feature_fixtures.json b/python/tests/fixtures/training_dataset_feature_fixtures.json
index 19a958b970..f48fd0fabd 100644
--- a/python/tests/fixtures/training_dataset_feature_fixtures.json
+++ b/python/tests/fixtures/training_dataset_feature_fixtures.json
@@ -62,26 +62,7 @@
       "timeTravelFormat": "HUDI"
     },
     "feature_group_feature_name": "test_feature_group_feature_name",
-    "label": "test_label",
-    "transformation_function": {
-      "count": 1,
-      "items": [
-        {
-          "featurestore_id": 11,
-          "transformation_fn": null,
-          "version": 1,
-          "name": "test_name",
-          "source_code_content": "test_source_code_content",
-          "builtin_source_code": "test_builtin_source_code",
-          "output_type": "float",
-          "id": 43,
-          "type": "transformationFunctionTDO",
-          "items": [],
-          "count": 0,
-          "href": "test_href"
-        }
-      ]
-    }
+    "label": "test_label"
   }
 },
 "get_fraud_online_training_dataset_features": {
diff --git a/python/tests/test_training_dataset_feature.py b/python/tests/test_training_dataset_feature.py
index 62a30aca5a..dc5af26112 100644
--- a/python/tests/test_training_dataset_feature.py
+++ b/python/tests/test_training_dataset_feature.py
@@ -15,7 +15,7 @@
 #

-from hsfs import feature_group, training_dataset_feature, transformation_function
+from hsfs import feature_group, training_dataset_feature


 class TestTrainingDatasetFeature:
@@ -37,11 +37,6 @@ def test_from_response_json(self, backend_fixtures):
             td_feature._feature_group_feature_name == "test_feature_group_feature_name"
         )
         assert td_feature.label == "test_label"
-        assert len(td_feature.transformation_function) == 1
-        assert isinstance(
-            td_feature.transformation_function[0],
-            transformation_function.TransformationFunction,
-        )

     def test_from_response_json_basic_info(self, backend_fixtures):
         # Arrange
@@ -61,4 +56,3 @@ def test_from_response_json_basic_info(self, backend_fixtures):
         assert td_feature._feature_group is None
         assert td_feature._feature_group_feature_name is None
         assert td_feature.label is False
-        assert td_feature.transformation_function is None

From 
fa1203224b105a31df1c68034e0ac4ab7ebe9b9d Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 6 May 2024 09:14:10 +0200 Subject: [PATCH 17/58] reformatted and added unit tests for python engine --- python/hsfs/__init__.py | 9 ++ python/hsfs/core/feature_view_engine.py | 9 +- .../core/transformation_function_engine.py | 129 +++++---------- python/hsfs/engine/python.py | 153 +++++++++++------- python/hsfs/feature_view.py | 22 ++- python/hsfs/hopsworks_udf.py | 7 +- python/hsfs/transformation_function.py | 8 +- .../test_transformation_function_engine.py | 106 ++++++++++++ python/tests/engine/test_python.py | 92 +++++------ 9 files changed, 323 insertions(+), 212 deletions(-) diff --git a/python/hsfs/__init__.py b/python/hsfs/__init__.py index 31efe17c56..d0297cb25e 100644 --- a/python/hsfs/__init__.py +++ b/python/hsfs/__init__.py @@ -19,8 +19,17 @@ import warnings import nest_asyncio +from packaging.version import Version +try: + import pandas as pd + + if Version(pd.__version__) > Version(2.0): + os.environ["USE_PYARROW_EXTENSION"] = "1" +except ImportError: + pass # Empty except block because environment variable "USE_PYARROW_EXTENSION" need not be set if pyarrow cannot be imported or if pandas version is less than 2.0 + # Setting polars skip cpu flag to suppress CPU false positive warning messages printed while importing hsfs os.environ["POLARS_SKIP_CPU_CHECK"] = "1" diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 3305c0e209..491be2c95e 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -395,7 +395,12 @@ def get_training_data( spine=spine, ) split_df = engine.get_instance().get_training_data( - td_updated, feature_view_obj, query, read_options, dataframe_type + td_updated, + feature_view_obj, + query, + read_options, + dataframe_type, + training_dataset_version, ) self.compute_training_dataset_statistics( feature_view_obj, td_updated, split_df @@ -720,7 +725,6 @@ def _get_training_dataset_metadata( ) # schema and transformation functions need to be set for writing training data or feature serving td.schema = feature_view_obj.schema - td.transformation_functions = feature_view_obj.transformation_functions return td def _get_training_datasets_metadata(self, feature_view_obj): @@ -730,7 +734,6 @@ def _get_training_datasets_metadata(self, feature_view_obj): # schema and transformation functions need to be set for writing training data or feature serving for td in tds: td.schema = feature_view_obj.schema - td.transformation_functions = feature_view_obj.transformation_functions return tds def get_training_datasets(self, feature_view_obj): diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 0ad86f0c53..89808b3db1 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union from hsfs import training_dataset -from hsfs.core import statistics_api, transformation_function_api +from hsfs.core import transformation_function_api if TYPE_CHECKING: @@ -63,13 +63,6 @@ def __init__(self, feature_store_id: int): self._transformation_function_api: transformation_function_api.TransformationFunctionApi = transformation_function_api.TransformationFunctionApi( feature_store_id ) - self._statistics_api: statistics_api.StatisticsApi = ( - statistics_api.StatisticsApi( - feature_store_id, 
training_dataset.TrainingDataset.ENTITY_TYPE
-            )
-        )
-        self._feature_view_api: Optional["feature_view_api.FeatureViewApi"] = None
-        self._statistics_engine: Optional["statistics_engine.StatisticsEngine"] = None

     def save(
         self, transformation_fn_instance: TransformationFunction
@@ -213,92 +206,46 @@ def compute_and_set_feature_statistics(
             stats.feature_descriptive_statistics
         )

-    def get_ready_to_use_transformation_fns(
-        self,
-        entity: Union[hsfs.feature_view.FeatureView, training_dataset.TrainingDataset],
-        training_dataset_version: Optional[int] = None,
-    ) -> Dict[
-        str, hsfs.transformation_function_attached.TransformationFunctionAttached
-    ]:
-        is_feat_view = isinstance(entity, feature_view.FeatureView)
-        if self._feature_view_api is None:
-            self._feature_view_api = feature_view_api.FeatureViewApi(
-                self._feature_store_id
-            )
-        if self._statistics_engine is None:
-            self._statistics_engine = statistics_engine.StatisticsEngine(
-                self._feature_store_id,
-                entity_type="featureview" if is_feat_view else "trainingdataset",
-            )
-        # get attached transformation functions
-        transformation_functions = (
-            self.get_td_transformation_fn(entity)
-            if isinstance(entity, training_dataset.TrainingDataset)
-            else (self.get_fv_attached_transformation_fn(entity.name, entity.version))
-        )
-        is_stat_required = (
-            len(
-                set(self.BUILTIN_FN_NAMES).intersection(
-                    set([tf.name for tf in transformation_functions.values()])
-                )
-            )
-            > 0
+    @staticmethod
+    def get_and_set_feature_statistics(
+        training_dataset: training_dataset.TrainingDataset,
+        feature_view_obj: FeatureView,
+        training_dataset_version: int = None,
+    ) -> None:
+        """
+        Function that retrieves from the backend the transformation statistics computed while creating the training dataset and assigns them to the hopsworks UDF objects.
+
+        The function assigns the retrieved statistics to the hopsworks UDF object so that they can be used when the UDF is executed.
+
+        # Arguments
+            training_dataset `TrainingDataset`: The training dataset for which the statistics are to be retrieved.
+            feature_view_obj `FeatureView`: The feature view in which the training data is being created.
+            training_dataset_version `int`: The version of the training dataset for which the statistics are to be retrieved.
+
+        # Raises
+            `ValueError` : If the statistics are not present in the backend.
+        """
+
+        is_stat_required = any(
+            [
+                tf.hopsworks_udf.statistics_required
+                for tf in feature_view_obj.transformation_functions
+            ]
         )
-        if not is_stat_required:
-            td_tffn_stats = None
-        else:
-            # if there are any built-in transformation functions get related statistics and
-            # populate with relevant arguments
-            # there should be only one statistics object with before_transformation=true
-            if is_feat_view and training_dataset_version is None:
-                raise ValueError(
-                    "Training data version is required for transformation. Call `feature_view.init_serving(version)` "
-                    "or `feature_view.init_batch_scoring(version)` to pass the training dataset version."
-                    "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`."
-                )
-            td_tffn_stats = self._statistics_engine.get(
-                entity,
+
+        if is_stat_required:
+            td_tffn_stats = training_dataset._statistics_engine.get(
+                feature_view_obj,
                 before_transformation=True,
                 training_dataset_version=training_dataset_version,
             )
-        if is_stat_required and td_tffn_stats is None:
-            raise ValueError(
-                "No statistics available for initializing transformation functions."
- + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." - ) - - transformation_fns = self.populate_builtin_attached_fns( - transformation_functions, - td_tffn_stats.feature_descriptive_statistics - if td_tffn_stats is not None - else None, - ) - return transformation_fns + if td_tffn_stats is None: + raise ValueError( + "No statistics available for initializing transformation functions." + ) - def get_fv_attached_transformation_fn( - self, fv_name: str, fv_version: int - ) -> Dict[str, "transformation_function_attached.TransformationFunctionAttached"]: - if self._feature_view_api is None: - self._feature_view_api = feature_view_api.FeatureViewApi( - self._feature_store_id - ) - self._statistics_engine = statistics_engine.StatisticsEngine( - self._feature_store_id, - entity_type="featureview", - ) - transformation_functions = ( - self._feature_view_api.get_attached_transformation_fn(fv_name, fv_version) - ) - if isinstance(transformation_functions, list): - transformation_functions_dict = dict( - [ - (tf.name, tf.transformation_function) - for tf in transformation_functions - ] - ) - else: - transformation_functions_dict = { - transformation_functions.name: transformation_functions.transformation_function - } - return transformation_functions_dict + for transformation_function in feature_view_obj.transformation_functions: + transformation_function.hopsworks_udf.transformation_statistics = ( + td_tffn_stats.feature_descriptive_statistics + ) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 9754b96997..42814ab079 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -878,7 +878,22 @@ def get_training_data( query_obj: query.Query, read_options: Dict[str, Any], dataframe_type: str, + training_dataset_version: int = None, ) -> Union[pd.DataFrame, pl.DataFrame]: + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset_obj `TrainingDataset`: The training dataset metadata object. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + query_obj `Query`: The query object that contains the query used to create the feature view. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + dataframe_type `str`: The type of dataframe returned. + training_dataset_version `int`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. + """ + # dataframe_type of list and numpy are prevented here because statistics needs to be computed from the returned dataframe. 
+        # The dataframe is converted into the required type in the function split_labels
         if dataframe_type.lower() not in ["default", "polars", "pandas"]:
@@ -891,15 +906,20 @@ def get_training_data(
                 feature_view_obj,
                 read_options,
                 dataframe_type,
+                training_dataset_version,
             )
         else:
             df = query_obj.read(
                 read_options=read_options, dataframe_type=dataframe_type
             )
-            # TODO : Add statistics
-            transformation_function_engine.TransformationFunctionEngine.add_feature_statistics(
-                training_dataset_obj, feature_view_obj, df
-            )
+            if training_dataset_version is None:
+                transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics(
+                    training_dataset_obj, feature_view_obj, df
+                )
+            else:
+                transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics(
+                    training_dataset_obj, feature_view_obj, training_dataset_version
+                )
             return self._apply_transformation_function(
                 training_dataset_obj.transformation_functions, df
             )
@@ -934,10 +954,21 @@ def _prepare_transform_split_df(
         feature_view_obj: feature_view.FeatureView,
         read_option: Dict[str, Any],
         dataframe_type: str,
+        training_dataset_version: int = None,
     ) -> Dict[str, Union[pd.DataFrame, pl.DataFrame]]:
         """
         Split a df into slices defined by `splits`. `splits` is a `dict(str, int)` whose keys are the split names and whose values are the split ratios.
+
+        # Arguments
+            query_obj `Query`: The query object that contains the query used to create the feature view.
+            training_dataset_obj `TrainingDataset`: The training dataset metadata object.
+            feature_view_obj `FeatureView`: The feature view object for which the training data is being created.
+            read_option `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data.
+            dataframe_type `str`: The type of dataframe returned.
+            training_dataset_version `int`: Version of training data to be retrieved.
+        # Raises
+            `ValueError`: If the training dataset statistics could not be retrieved.
         """
         if (
             training_dataset_obj.splits[0].split_type
@@ -970,11 +1001,14 @@ def _prepare_transform_split_df(
                 training_dataset_obj,
             )

-        # apply transformations
-        # 1st parametrise transformation functions with dt split stats
-        transformation_function_engine.TransformationFunctionEngine.add_feature_statistics(
-            training_dataset_obj, feature_view_obj, result_dfs
-        )
+        if training_dataset_version is None:
+            transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics(
+                training_dataset_obj, feature_view_obj, result_dfs
+            )
+        else:
+            transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics(
+                training_dataset_obj, feature_view_obj, training_dataset_version
+            )
         # and then apply them
         for split_name in result_dfs:
             result_dfs[split_name] = self._apply_transformation_function(
@@ -1153,8 +1187,24 @@ def _create_hive_connection(

     def _return_dataframe_type(
         self, dataframe: Union[pd.DataFrame, pl.DataFrame], dataframe_type: str
     ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]:
-        if dataframe_type.lower() in ["default", "pandas", "polars"]:
+        """
+        Returns a dataframe of a particular type.
+
+        # Arguments
+            dataframe `Union[pd.DataFrame, pl.DataFrame]`: Input dataframe.
+            dataframe_type `str`: Type of dataframe to be returned.
+        # Returns
+            `Union[pd.DataFrame, pl.DataFrame, np.array, list]`: DataFrame of the required type.
+ """ + if dataframe_type.lower() in ["default", "pandas"]: return dataframe + if dataframe_type.lower() == "polars": + if not ( + isinstance(dataframe, pl.DataFrame) or isinstance(dataframe, pl.Series) + ): + return pl.from_pandas(dataframe) + else: + return dataframe if dataframe_type.lower() == "numpy": return dataframe.values if dataframe_type.lower() == "python": @@ -1235,66 +1285,55 @@ def _apply_transformation_function( transformation_functions: List[TransformationFunction], dataset: Union[pd.DataFrame, pl.DataFrame], ) -> Union[pd.DataFrame, pl.DataFrame]: + """ + Apply transformation function to the dataframe. + + # Arguments + transformation_functions `List[TransformationFunction]` : List of transformation functions. + dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. + # Raises + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. + """ transformed_features = set() + + if isinstance(dataset, pl.DataFrame) or isinstance( + dataset, pl.dataframe.frame.DataFrame + ): + # Converting polars dataframe to pandas because currently we support only pandas UDF's as transformation functions. + if os.getenv("USE_PYARROW_EXTENSION", False): + dataset = dataset.to_pandas( + use_pyarrow_extension_array=True + ) # Zero copy if pyarrow extension can be used. + else: + dataset = dataset.to_pandas(use_pyarrow_extension_array=False) + for transformation_function in transformation_functions: hopsworks_udf = transformation_function.hopsworks_udf missing_features = set(hopsworks_udf.transformation_features) - set( dataset.columns ) - - # TODO : Add documentation link in exception if missing_features: raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) transformed_features.update( transformation_function.hopsworks_udf.transformation_features ) - - if isinstance(dataset, pl.DataFrame) or isinstance( - dataset, pl.dataframe.frame.DataFrame - ): - pass - else: - dataset = pd.concat( - [ - dataset, - transformation_function.hopsworks_udf.get_udf()( - *( - [ - dataset[feature] - for feature in transformation_function.hopsworks_udf.transformation_features - ] - ) - ), - ], - axis=1, - ) - # TODO : Think about what to do in cases where the output is a polars dataframe..... 
- # if isinstance(dataset, pl.DataFrame) or isinstance( - # dataset, pl.dataframe.frame.DataFrame - # ): - # dataset = dataset.with_columns( - # pl.col(feature_name).map_elements( - # transformation_fn.transformation_fn - # ) - # ) - # else: - - # TODO : Think if below code is actually required - - # The below functions is not required for Polars since polars does have object types like pandas - # if not ( - # isinstance(dataset, pl.DataFrame) - # or isinstance(dataset, pl.dataframe.frame.DataFrame) - # ): - # offline_type = Engine.convert_spark_type_to_offline_type( - # transformation_fn.output_type - # ) - # dataset[feature_name] = Engine._cast_column_to_offline_type( - # dataset[feature_name], offline_type - # ) + dataset = pd.concat( + [ + dataset, + transformation_function.hopsworks_udf.get_udf()( + *( + [ + dataset[feature] + for feature in transformation_function.hopsworks_udf.transformation_features + ] + ) + ), + ], + axis=1, + ) dataset = dataset.drop(transformed_features, axis=1) return dataset diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 837bc168c2..7c8a914dd4 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -123,14 +123,20 @@ def __init__( training_helper_columns if training_helper_columns else [] ) - self._transformation_functions: List[TransformationFunction] = [ - TransformationFunction( - self.featurestore_id, hopsworks_udf=transformation_function, version=1 - ) - if not isinstance(transformation_function, TransformationFunction) - else transformation_function - for transformation_function in transformation_functions - ] + self._transformation_functions: List[TransformationFunction] = ( + [ + TransformationFunction( + self.featurestore_id, + hopsworks_udf=transformation_function, + version=1, + ) + if not isinstance(transformation_function, TransformationFunction) + else transformation_function + for transformation_function in transformation_functions + ] + if transformation_functions + else [] + ) self._features = [] self._feature_view_engine: feature_view_engine.FeatureViewEngine = ( diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b56efb2c5a..34edaf4a64 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -473,6 +473,7 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": self._transformation_features, features ) ] + udf.output_column_names = udf._get_output_column_names() return udf def get_udf(self) -> Callable: @@ -497,7 +498,7 @@ def get_udf(self) -> Callable: def to_dict(self) -> Dict[str, Any]: """ - Convert class into a dictionary for json serialization. + Convert class into a dictionary. # Returns `Dict`: Dictionary that contains all data required to json serialize the object. @@ -515,7 +516,7 @@ def to_dict(self) -> Dict[str, Any]: def json(self) -> str: """ - Json serialize object. + Convert class into its json serialized form. # Returns `str`: Json serialized object. @@ -527,7 +528,7 @@ def from_response_json( cls: "HopsworksUdf", json_dict: Dict[str, Any] ) -> "HopsworksUdf": """ - Function that deserializes json obtained from the java backend. + Function that constructs the class object from its json serialization. # Arguments json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. 
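[Editor's note] The `__call__` and `get_udf` changes above are easier to follow with a small, concrete sketch of how they interact in the python engine. This is illustrative only, not part of the patch; the output column name shown follows the pattern asserted in the updated engine tests.

```python
# Illustrative sketch (not part of the patch) of the __call__ / get_udf pair.
import pandas as pd
from hsfs import engine
from hsfs.hopsworks_udf import hopsworks_udf

engine.init("python")  # get_udf() returns a plain callable on this engine

@hopsworks_udf(int)
def plus_one(col1):
    return col1 + 1

# Calling the UDF returns a copy bound to the given feature and, per the
# __call__ change above, refreshes its output column names.
bound = plus_one("tf_name")

df = pd.DataFrame({"tf_name": [1, 2]})
transformed = bound.get_udf()(df["tf_name"])
# Per the updated engine tests, the result column is named "plus_one_tf_name_".
```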
diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 0b209bf5c4..4e23853c73 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -137,7 +137,7 @@ def from_response_json( cls, json_dict: Dict[str, Any] ) -> Union[TransformationFunction, List[TransformationFunction]]: """ - Function that deserializes json obtained from the java backend. + Function that constructs the class object from its json serialization. # Arguments json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. @@ -166,7 +166,7 @@ def update_from_response_json( self, json_dict: Dict[str, Any] ) -> TransformationFunction: """ - Function that updates class based on the response obtained from the java backend. + Function that updates the class object from its json serialization. # Arguments json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. @@ -179,7 +179,7 @@ def update_from_response_json( def json(self) -> str: """ - Json serialize object. + Convert class into its json serialized form. # Returns `str`: Json serialized object. @@ -188,7 +188,7 @@ def json(self) -> str: def to_dict(self) -> Dict[str, Any]: """ - Convert class into a dictionary for json serialization. + Convert class into a dictionary. # Returns `Dict`: Dictionary that contains all data required to json serialize the object. diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index ff3c4f4f85..29e20f3cac 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -366,3 +366,109 @@ def testFunction1(col1): mock_s_engine.return_value.compute_transformation_fn_statistics.call_count == 1 ) + + def test_get_and_set_feature_statistics_no_statistics_required(self, mocker): + feature_store_id = 99 + mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") + + tf_engine = transformation_function_engine.TransformationFunctionEngine( + feature_store_id + ) + + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 + + tf1 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction1, + ) + + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, + ) + + td = training_dataset.TrainingDataset( + name="test", + version=1, + data_format="CSV", + featurestore_id=99, + splits={"train": 0.8, "test": 0.2}, + id=10, + ) + + fv = feature_view.FeatureView( + name="test", + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], + ) + + # Act + tf_engine.get_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, training_dataset_version=1 + ) + + # Assert + assert mock_s_engine.return_value.get.call_count == 0 + + def test_get_and_set_feature_statistics_statistics_required(self, mocker): + feature_store_id = 99 + mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") + + tf_engine = transformation_function_engine.TransformationFunctionEngine( + feature_store_id + ) + + @hopsworks_udf(int) + def testFunction1(col1, statistics_col1): + return col1 + statistics_col1.mean + + tf1 = transformation_function.TransformationFunction( + 
feature_store_id, + hopsworks_udf=testFunction1, + ) + + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, + ) + + td = training_dataset.TrainingDataset( + name="test", + version=1, + data_format="CSV", + featurestore_id=99, + splits={"train": 0.8, "test": 0.2}, + id=10, + ) + + fv = feature_view.FeatureView( + name="test", + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], + ) + + # Act + tf_engine.get_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, training_dataset_version=1 + ) + + # Assert + assert mock_s_engine.return_value.get.call_count == 1 diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 08bc8d52a7..88ff95a34b 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -23,12 +23,12 @@ import pytest from confluent_kafka.admin import PartitionMetadata, TopicMetadata from hsfs import ( + engine, feature, feature_group, feature_view, storage_connector, training_dataset, - transformation_function, util, ) from hsfs.client import exceptions @@ -36,10 +36,14 @@ from hsfs.constructor.hudi_feature_group_alias import HudiFeatureGroupAlias from hsfs.core import inode, job from hsfs.engine import python +from hsfs.hopsworks_udf import hopsworks_udf from hsfs.training_dataset_feature import TrainingDatasetFeature from polars.testing import assert_frame_equal as polars_assert_frame_equal +engine._engine_type = "python" + + class TestPython: def test_sql(self, mocker): # Arrange @@ -2423,7 +2427,7 @@ def test_split_labels_labels_dataframe_type_polars(self): result_df, result_df_split = python_engine.split_labels( df=df, dataframe_type="polars", labels="col1" ) - print(type(result_df_split)) + # Assert assert isinstance(result_df, pl.DataFrame) or isinstance( result_df, pl.dataframe.frame.DataFrame @@ -3233,41 +3237,39 @@ def test_apply_transformation_function_pandas(self, mocker): python_engine = python.Engine() - def plus_one(a): - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( - 99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), featurestore_id=99, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, + transformation_functions=[plus_one("tf_name")], ) df = pd.DataFrame(data={"tf_name": [1, 2]}) # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, dataset=df + transformation_functions=fv.transformation_functions, dataset=df ) # Assert - assert len(result["tf_name"]) == 2 - assert result["tf_name"][0] == 2 - assert result["tf_name"][1] == 3 + assert len(result["plus_one_tf_name_"]) == 2 + assert result["plus_one_tf_name_"][0] == 2 + assert result["plus_one_tf_name_"][1] == 3 def test_apply_transformation_function_polars(self, mocker): # Arrange @@ 
-3275,41 +3277,39 @@ def test_apply_transformation_function_polars(self, mocker): python_engine = python.Engine() - def plus_one(a): - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( - 99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), featurestore_id=99, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, + transformation_functions=[plus_one("tf_name")], ) df = pl.DataFrame(data={"tf_name": [1, 2]}) # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, dataset=df + transformation_functions=fv.transformation_functions, dataset=df ) # Assert - assert len(result["tf_name"]) == 2 - assert result["tf_name"][0] == 2 - assert result["tf_name"][1] == 3 + assert len(result["plus_one_tf_name_"]) == 2 + assert result["plus_one_tf_name_"][0] == 2 + assert result["plus_one_tf_name_"][1] == 3 def test_get_unique_values(self): # Arrange From f79a3492010df0f475b97eae3732cca1cb8f3811 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 6 May 2024 22:48:14 +0200 Subject: [PATCH 18/58] most unit tests fixed --- python/hsfs/__init__.py | 2 +- python/hsfs/core/feature_view_engine.py | 4 +- .../hsfs/core/transformation_function_api.py | 26 -- .../core/transformation_function_engine.py | 52 +-- python/hsfs/core/vector_server.py | 8 +- python/hsfs/engine/python.py | 18 +- python/hsfs/engine/spark.py | 156 +++++---- python/hsfs/hopsworks_udf.py | 44 ++- .../hsfs/transformation_function_attached.py | 71 ---- python/pyproject.toml | 1 + python/tests/core/test_arrow_flight_client.py | 3 - python/tests/core/test_feature_view_engine.py | 1 - .../core/test_training_dataset_engine.py | 7 +- .../test_transformation_function_engine.py | 4 +- python/tests/engine/test_python.py | 90 ++++- python/tests/engine/test_spark.py | 324 ++++++++++++++---- python/tests/fixtures/backend_fixtures.py | 1 - python/tests/pyproject.toml | 8 + .../test_transformation_function_attached.py | 88 ----- 19 files changed, 539 insertions(+), 369 deletions(-) delete mode 100644 python/hsfs/transformation_function_attached.py delete mode 100644 python/tests/test_transformation_function_attached.py diff --git a/python/hsfs/__init__.py b/python/hsfs/__init__.py index d0297cb25e..82d368d243 100644 --- a/python/hsfs/__init__.py +++ b/python/hsfs/__init__.py @@ -25,7 +25,7 @@ try: import pandas as pd - if Version(pd.__version__) > Version(2.0): + if Version(pd.__version__) > Version("2.0"): os.environ["USE_PYARROW_EXTENSION"] = "1" except ImportError: pass # Empty except block because environment variable "USE_PYARROW_EXTENSION" need not be set if pyarrow cannot be imported or if pandas version is less than 2.0 diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 491be2c95e..19ea348b97 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -723,7 +723,7 @@ def _get_training_dataset_metadata( td = 
self._feature_view_api.get_training_dataset_by_version( feature_view_obj.name, feature_view_obj.version, training_dataset_version ) - # schema and transformation functions need to be set for writing training data or feature serving + # schema needs to be set for writing training data or feature serving td.schema = feature_view_obj.schema return td @@ -731,7 +731,7 @@ def _get_training_datasets_metadata(self, feature_view_obj): tds = self._feature_view_api.get_training_datasets( feature_view_obj.name, feature_view_obj.version ) - # schema and transformation functions need to be set for writing training data or feature serving + # schema needs to be set for writing training data or feature serving for td in tds: td.schema = feature_view_obj.schema return tds diff --git a/python/hsfs/core/transformation_function_api.py b/python/hsfs/core/transformation_function_api.py index a0f21f0097..f6692f8f62 100644 --- a/python/hsfs/core/transformation_function_api.py +++ b/python/hsfs/core/transformation_function_api.py @@ -19,9 +19,7 @@ from hsfs import ( client, - training_dataset, transformation_function, - transformation_function_attached, ) @@ -112,27 +110,3 @@ def delete( ] headers = {"content-type": "application/json"} _client._send_request("DELETE", path_params, headers=headers) - - def get_td_transformation_fn( - self, training_dataset_instance: training_dataset.TrainingDataset - ) -> transformation_function_attached.TransformationFunctionAttached: - """ - Retrieve TransformationFunctionAttached instance - Args: - training_dataset_instance: TrainingDataset, required - training dataset metadata object. - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - "trainingdatasets", - training_dataset_instance.id, - "transformationfunctions", - ] - - return transformation_function_attached.TransformationFunctionAttached.from_response_json( - _client._send_request("GET", path_params) - ) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 89808b3db1..2396cb1a03 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -180,31 +180,35 @@ def compute_and_set_feature_statistics( statistics_features.update( transformation_function.hopsworks_udf.statistics_features ) + if statistics_features: + # compute statistics on training data + if training_dataset.splits: + # compute statistics before transformations are applied + stats = ( + TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset.get(training_dataset.train_split), + feature_view_obj, + ) + ) + else: + stats = ( + TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset, + feature_view_obj, + ) + ) - # compute statistics on training data - if training_dataset.splits: - # compute statistics before transformations are applied - stats = TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - list(statistics_features), - [], - dataset.get(training_dataset.train_split), - feature_view_obj, - ) - else: - stats = TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - list(statistics_features), - [], - dataset, - feature_view_obj, - ) - - # Set statistics computed in the hopsworks UDF - for transformation_function in 
feature_view_obj.transformation_functions: - transformation_function.hopsworks_udf.transformation_statistics = ( - stats.feature_descriptive_statistics - ) + # Set statistics computed in the hopsworks UDF + for transformation_function in feature_view_obj.transformation_functions: + transformation_function.hopsworks_udf.transformation_statistics = ( + stats.feature_descriptive_statistics + ) @staticmethod def get_and_set_feature_statistics( diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 2ed6d8688f..c6cd5959bd 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -103,7 +103,7 @@ def __init__( self._inference_helper_col_name = [ feat.name for feat in features if feat.inference_helper_column ] - self._transformed_feature_vector_col_name = None + self._transformed_feature_vector_col_name: List[str] = None self._skip_fg_ids = skip_fg_ids or set() self._serving_keys = serving_keys or [] @@ -1077,9 +1077,9 @@ def default_client(self, default_client: Literal["rest", "sql"]): def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: + self._transformed_feature_vector_col_name = self._feature_vector_col_name for transformation_function in self._transformation_functions: - self._transformed_feature_vector_col_name = ( - self._feature_vector_col_name - + transformation_function.hopsworks_udf.transformation_feature_names + self._transformed_feature_vector_col_name += ( + transformation_function.hopsworks_udf.transformation_features ) return self._transformed_feature_vector_col_name \ No newline at end of file diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 42814ab079..6d213f7778 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -912,14 +912,14 @@ def get_training_data( df = query_obj.read( read_options=read_options, dataframe_type=dataframe_type ) - if training_dataset_version is None: - transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, df - ) - else: - transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, training_dataset_version - ) + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( + training_dataset_obj, feature_view_obj, df + ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset_obj, feature_view_obj, training_dataset_version + # ) return self._apply_transformation_function( training_dataset_obj.transformation_functions, df ) @@ -1291,6 +1291,8 @@ def _apply_transformation_function( # Arguments transformation_functions `List[TransformationFunction]` : List of transformation functions. dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. + # Returns + `DataFrame`: A pandas dataframe with the transformed data. # Raises `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. 
""" diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 38867ea81e..f1f6fcb69a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,7 +23,7 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import Any, List, Optional, TypeVar, Union, TYPE_CHECKING +from typing import Any, List, Optional, TypeVar, Union, TYPE_CHECKING, Dict import avro import numpy as np @@ -31,6 +31,9 @@ import tzlocal if TYPE_CHECKING: + from hsfs.constructor.query import Query + from hsfs.feature_view import FeatureView + from hsfs.training_dataset import TrainingDataset from hsfs.transformation_function import TransformationFunction # in case importing in %%local @@ -545,12 +548,26 @@ def _online_fg_to_avro(self, feature_group, dataframe): def get_training_data( self, - training_dataset, - feature_view_obj, - query_obj, - read_options, - dataframe_type, + training_dataset: TrainingDataset, + feature_view_obj: FeatureView, + query_obj: Query, + read_options: Dict[str, Any], + dataframe_type: str, + training_dataset_version: int = None, ): + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset_obj `TrainingDataset`: The training dataset metadata object. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + query_obj `Query`: The query object that contains the query used to create the feature view. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + dataframe_type `str`: The type of dataframe returned. + training_dataset_version `int`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. + """ return self.write_training_dataset( training_dataset, query_obj, @@ -559,6 +576,7 @@ def get_training_data( read_options=read_options, to_df=True, feature_view_obj=feature_view_obj, + training_dataset_version=training_dataset_version, ) def split_labels(self, df, labels, dataframe_type): @@ -581,14 +599,30 @@ def drop_columns(self, df, drop_cols): def write_training_dataset( self, - training_dataset, - query_obj, - user_write_options, - save_mode, - read_options=None, - feature_view_obj=None, - to_df=False, + training_dataset: TrainingDataset, + query_obj: Query, + user_write_options: Dict[str, Any], + save_mode: str, + read_options: Dict[str, Any] = None, + feature_view_obj: FeatureView = None, + to_df: bool = False, + training_dataset_version: Optional[int] = None, ): + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset `TrainingDataset`: The training dataset metadata object. + query_obj `Query`: The query object that contains the query used to create the feature view. + user_write_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for writing data using spark. + save_mode `str`: Spark save mode to be used while writing data. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + to_df `bool`: Return dataframe instead of writing the data. + training_dataset_version `Optional[int]`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. 
+ """ write_options = self.write_options( training_dataset.data_format, user_write_options ) @@ -603,14 +637,20 @@ def write_training_dataset( else: raise ValueError("Dataset should be a query.") - transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( training_dataset, feature_view_obj, dataset ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset, feature_view_obj, training_dataset_version + # ) + if training_dataset.coalesce: dataset = dataset.coalesce(1) path = training_dataset.location + "/" + training_dataset.name return self._write_training_dataset_single( - training_dataset.transformation_functions, + feature_view_obj.transformation_functions, dataset, training_dataset.storage_connector, training_dataset.data_format, @@ -629,11 +669,22 @@ def write_training_dataset( split_dataset[key] = split_dataset[key].cache() - transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( - training_dataset, feature_view_obj, split_dataset - ) + if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( + training_dataset, feature_view_obj, split_dataset + ) + else: + transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + training_dataset, feature_view_obj, training_dataset_version + ) + return self._write_training_dataset_splits( - training_dataset, split_dataset, write_options, save_mode, to_df=to_df + training_dataset, + split_dataset, + write_options, + save_mode, + to_df=to_df, + transformation_functions=feature_view_obj.transformation_functions, ) def _split_df(self, query_obj, training_dataset, read_options=None): @@ -785,11 +836,12 @@ def _write_training_dataset_splits( write_options, save_mode, to_df=False, + transformation_functions: List[TransformationFunction] = None, ): for split_name, feature_dataframe in feature_dataframes.items(): split_path = training_dataset.location + "/" + str(split_name) feature_dataframes[split_name] = self._write_training_dataset_single( - training_dataset.transformation_functions, + transformation_functions, feature_dataframe, training_dataset.storage_connector, training_dataset.data_format, @@ -1166,9 +1218,19 @@ def add_cols_to_delta_table(self, feature_group, new_features): ).save(feature_group.location) def _apply_transformation_function( - self, transformation_functions: List[TransformationFunction], dataset + self, transformation_functions: List[TransformationFunction], dataset: DataFrame ): - # generate transformation function expressions + """ + Apply transformation function to the dataframe. + + # Arguments + transformation_functions `List[TransformationFunction]` : List of transformation functions. + dataset `Union[DataFrame]`: A spark dataframe. + # Returns + `DataFrame`: A spark dataframe with the transformed data. + # Raises + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. 
+ """ transformed_features = set() transformations = [] transformation_features = [] @@ -1180,62 +1242,32 @@ def _apply_transformation_function( dataset.columns ) - # TODO : Add documentation link in exception if missing_features: raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) transformed_features.update( transformation_function.hopsworks_udf.transformation_features ) - # TODO : Add statistics pandas_udf = hopsworks_udf.get_udf() - output_col_name = f'{hopsworks_udf.function_name}<{"-".join(hopsworks_udf.transformation_features)}>' + output_col_name = hopsworks_udf.output_column_names[0] + transformations.append(pandas_udf) - transformation_features.append(hopsworks_udf.transformation_features) output_col_names.append(output_col_name) + transformation_features.append(hopsworks_udf.transformation_features) - if isinstance(hopsworks_udf.return_type, List): + if len(hopsworks_udf.output_types) > 1: explode_name.append(f"{output_col_name}.*") else: explode_name.append(output_col_name) - def timezone_decorator(func, trans_fn=hopsworks_udf): - if trans_fn.output_type != "TIMESTAMP": - return func - - current_timezone = tzlocal.get_localzone() - - def decorated_func(x): - result = func(x) - if isinstance(result, datetime): - if result.tzinfo is None: - # if timestamp is timezone unaware, make sure it's localized to the system's timezone. - # otherwise, spark will implicitly convert it to the system's timezone. - return result.replace(tzinfo=current_timezone) - else: - # convert to utc, then localize to system's timezone - return result.astimezone(timezone.utc).replace( - tzinfo=current_timezone - ) - return result - - return decorated_func - - # TODO : Timezone aware check see if I need to do also. - # self._spark_session.udf.register( - # fn_registration_name, - # timezone_decorator(transformation_fn.transformation_fn), - # transformation_fn.output_type, - # ) - - # generate non transformation expressions - - # generate entire expression and execute it - - untransformed_columns = set(dataset.columns) - transformed_features + untransformed_columns = [] # Untransformed column maintained as a list since order is imported while selecting features. + for column in dataset.columns: + if column not in transformed_features: + untransformed_columns.append(column) + # Applying transformations transformed_dataset = dataset.select( *untransformed_columns, *[ diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 34edaf4a64..554a3de9fd 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -175,7 +175,7 @@ def _validate_and_convert_output_types( and output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.values() ): raise FeatureStoreException( - f"Output type {output_type} is not supported. Please refer to DOCUMENTATION to get more information on the supported types." + f"Output type {output_type} is not supported. Please refer to the documentation to get more information on the supported types." 
                )
                convert_output_types.append(
                    output_type
@@ -364,7 +364,7 @@ def _format_source_code(
         source_code = source_code.split("\n")
         # Reconstruct the modified function as a string
         modified_source = (
-            new_signature + "\n" + "\n\t".join(source_code[signature_end_line + 1 :])
+            new_signature + "\n\t" + "\n\t".join(source_code[signature_end_line + 1 :])
         )

         # Define a new function with the modified source code
@@ -377,13 +377,13 @@ def _get_output_column_names(self) -> str:
         # Returns
             `List[str]`: List of feature names for the transformed columns
         """
+        _BASE_COLUMN_NAME = (
+            f'{self.function_name}_{"-".join(self.transformation_features)}_'
+        )
         if len(self.output_types) > 1:
-            return [
-                f'{self.function_name}_{"-".join(self.transformation_features)}_{i}'
-                for i in range(len(self.output_types))
-            ]
+            return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.output_types))]
         else:
-            return [f'{self.function_name}_{"-".join(self.transformation_features)}_']
+            return [f"{_BASE_COLUMN_NAME}"]

     def _create_pandas_udf_return_schema_from_list(self) -> str:
         """
@@ -395,7 +395,7 @@
         if len(self.output_types) > 1:
             return ", ".join(
                 [
-                    f"{self.output_column_names[i]} {self.output_types[i]}"
+                    f"`{self.output_column_names[i]}` {self.output_types[i]}"
                     for i in range(len(self.output_types))
                 ]
             )
@@ -412,20 +412,40 @@ def hopsworksUdf_wrapper(self) -> Callable:
         # Returns
             `Callable`: A wrapper function that renames outputs of the User defined function into specified output column names.
         """
+
+        # Function to make the transformation function timezone safe. Defined as a string because it has to be dynamically injected into scope to be executed by spark.
+        convert_timestamp_function = """def convert_timezone(date_time_col : pd.Series):
+    import tzlocal
+    current_timezone = tzlocal.get_localzone()
+    if date_time_col.dt.tz is None:
+        # if timestamp is timezone unaware, make sure it's localized to the system's timezone.
+        # otherwise, spark will implicitly convert it to the system's timezone.
+        return date_time_col.dt.tz_localize(str(current_timezone))
+    else:
+        # convert to utc, then localize to system's timezone
+        return date_time_col.dt.tz_convert('UTC').dt.tz_localize(None).dt.tz_localize(str(current_timezone))"""
+
         # Defining wrapper function that renames the column names to specific names
         if len(self.output_types) > 1:
-            code = f"""def renaming_wrapper(*args):
-    import pandas as pd
+            code = f"""import pandas as pd
+{convert_timestamp_function}
+def renaming_wrapper(*args):
    {self._formatted_function_source}
    df = {self.function_name}(*args)
    df = df.rename(columns = {{df.columns[i]: _output_col_names[i] for i in range(len(df.columns))}})
+    for col in df:
+        if pd.api.types.is_datetime64_any_dtype(df[col]):
+            df[col] = convert_timezone(df[col])
    return df"""
         else:
-            code = f"""def renaming_wrapper(*args):
-    import pandas as pd
+            code = f"""import pandas as pd
+{convert_timestamp_function}
+def renaming_wrapper(*args):
    {self._formatted_function_source}
    df = {self.function_name}(*args)
    df = df.rename(_output_col_names[0])
+    if pd.api.types.is_datetime64_any_dtype(df):
+        df = convert_timezone(df)
    return df"""

         # injecting variables into scope used to execute wrapper function.

diff --git a/python/hsfs/transformation_function_attached.py b/python/hsfs/transformation_function_attached.py
deleted file mode 100644
index ca4deceddb..0000000000
--- a/python/hsfs/transformation_function_attached.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright 2021.
Logical Clocks AB
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from __future__ import annotations
-
-import humps
-from hsfs import transformation_function as transformation_fn
-
-
-class TransformationFunctionAttached:
-    def __init__(
-        self,
-        name,
-        transformation_function,
-        type=None,
-        items=None,
-        count=None,
-        href=None,
-        **kwargs,
-    ):
-        self._name = name
-        self._transformation_function = (
-            transformation_fn.TransformationFunction.from_response_json(
-                transformation_function
-            )
-            if isinstance(transformation_function, dict)
-            else transformation_function
-        )
-
-    @classmethod
-    def from_response_json(cls, json_dict):
-        json_decamelized = humps.decamelize(json_dict)
-        if "count" in json_decamelized:
-            if json_decamelized["count"] == 0:
-                return []
-            return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]]
-        else:
-            return cls(**json_decamelized)
-
-    def update_from_response_json(self, json_dict):
-        json_decamelized = humps.decamelize(json_dict)
-        self.__init__(**json_decamelized)
-        return self
-
-    @property
-    def name(self):
-        """Set feature name."""
-        return self._name
-
-    @name.setter
-    def name(self, name):
-        self._name = name
-
-    @property
-    def transformation_function(self):
-        """Set transformation functions."""
-        return self._transformation_function
-
-    @transformation_function.setter
-    def transformation_function(self, transformation_function):
-        self._transformation_function = transformation_function

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 2b3d69db4d..1ad6c8c5f4 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -143,6 +143,7 @@ exclude = [
     "site-packages",
     "venv",
     "java",
+    "python/tests/transformations_test_helper/" # transformations_test_helper is excluded from formatting and linting because its exact formatting is required for the test cases
 ]

 # Same as Black.
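As context for the `_apply_transformation_function` changes in the spark engine above, a hedged standalone sketch of the Spark mechanism they rely on: a pandas UDF declared with a struct schema yields one nested column, which is aliased and then expanded with "<name>.*", mirroring the `explode_name` bookkeeping for multi-output UDFs. The column and function names here are illustrative, not taken from the patch:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(pd.DataFrame({"col1": [1, 2]}))


# A pandas UDF with a struct return schema returns a pd.DataFrame per batch ...
@pandas_udf("`plus_two_col1_0` bigint, `plus_two_col1_1` bigint")
def plus_two(col1: pd.Series) -> pd.DataFrame:
    return pd.DataFrame({"plus_two_col1_0": col1 + 1, "plus_two_col1_1": col1 + 2})


# ... and selecting "<alias>.*" flattens the struct into separate output columns.
result = df.select(plus_two("col1").alias("out")).select("out.*")
result.show()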
diff --git a/python/tests/core/test_arrow_flight_client.py b/python/tests/core/test_arrow_flight_client.py index 0b647aedf1..faa480c6ad 100644 --- a/python/tests/core/test_arrow_flight_client.py +++ b/python/tests/core/test_arrow_flight_client.py @@ -77,9 +77,6 @@ def _arrange_featureview_mocks(self, mocker, backend_fixtures): "hsfs.core.feature_view_engine.FeatureViewEngine.get_batch_query", return_value=fg.select_all(), ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mocker.patch("hsfs.engine.python.Engine._apply_transformation_function") # required for batch query diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index e50868285d..b1fb7ee08a 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -1641,7 +1641,6 @@ def test_get_training_dataset_metadata(self, mocker): # Assert assert mock_fv_api.return_value.get_training_dataset_by_version.call_count == 1 assert result.schema == fv.schema - assert result.transformation_functions == fv.transformation_functions def test_create_training_data_metadata(self, mocker): # Arrange diff --git a/python/tests/core/test_training_dataset_engine.py b/python/tests/core/test_training_dataset_engine.py index a1e28c49ae..5e77445971 100644 --- a/python/tests/core/test_training_dataset_engine.py +++ b/python/tests/core/test_training_dataset_engine.py @@ -23,6 +23,7 @@ ) from hsfs.constructor import query from hsfs.core import training_dataset_engine +from hsfs.hopsworks_udf import hopsworks_udf class TestTrainingDatasetEngine: @@ -111,20 +112,18 @@ def test_save_transformation_functions(self, mocker): feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.transformation_function.TransformationFunction._extract_source_code" - ) mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") + @hopsworks_udf(int) def plus_one(a): return a + 1 tf = transformation_function.TransformationFunction( - 1, plus_one, 1, "plus_one", output_type=str + hopsworks_udf=plus_one, featurestore_id=99 ) td = training_dataset.TrainingDataset( diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index 29e20f3cac..51dd623ef1 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -306,7 +306,7 @@ def testFunction1(col1): # Assert assert ( mock_s_engine.return_value.compute_transformation_fn_statistics.call_count - == 1 + == 0 ) def test_compute_and_set_feature_statistics_train_test_split(self, mocker): @@ -364,7 +364,7 @@ def testFunction1(col1): # Assert assert ( mock_s_engine.return_value.compute_transformation_fn_statistics.call_count - == 1 + == 0 ) def test_get_and_set_feature_statistics_no_statistics_required(self, mocker): diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 88ff95a34b..55267cc7ce 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -3234,7 +3234,7 @@ def test_add_file(self): def test_apply_transformation_function_pandas(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") - + engine._engine_type = "python" 
python_engine = python.Engine() @hopsworks_udf(int) @@ -3271,10 +3271,98 @@ def plus_one(col1): assert result["plus_one_tf_name_"][0] == 2 assert result["plus_one_tf_name_"][1] == 3 + def test_apply_transformation_function_multiple_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "python" + python_engine = python.Engine() + + @hopsworks_udf([int, int]) + def plus_two(col1): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all(result.columns == ["col2", "plus_two_col1_0", "plus_two_col1_1"]) + assert len(result) == 2 + assert result["plus_two_col1_0"][0] == 2 + assert result["plus_two_col1_0"][1] == 3 + assert result["plus_two_col1_1"][0] == 3 + assert result["plus_two_col1_1"][1] == 4 + + def test_apply_transformation_function_multiple_input_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + + @hopsworks_udf([int, int]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all(result.columns == ["plus_two_col1-col2_0", "plus_two_col1-col2_1"]) + assert len(result) == 2 + assert result["plus_two_col1-col2_0"][0] == 2 + assert result["plus_two_col1-col2_0"][1] == 3 + assert result["plus_two_col1-col2_1"][0] == 12 + assert result["plus_two_col1-col2_1"][1] == 13 + def test_apply_transformation_function_polars(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + engine._engine_type = "python" python_engine = python.Engine() @hopsworks_udf(int) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 5c7d76add0..09300059f3 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -23,6 +23,7 @@ expectation_suite, feature, feature_group, + feature_view, storage_connector, training_dataset, training_dataset_feature, @@ -33,6 +34,7 @@ from hsfs.constructor import hudi_feature_group_alias, query from hsfs.core import training_dataset_engine from hsfs.engine import spark +from hsfs.hopsworks_udf import hopsworks_udf from hsfs.training_dataset_feature import TrainingDatasetFeature from pyspark.sql import DataFrame from pyspark.sql.types import ( @@ -1729,9 +1731,6 @@ def test_write_training_dataset(self, mocker): mock_spark_engine_convert_to_default_dataframe = 
mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1806,7 +1805,24 @@ def test_write_training_dataset_to_df(self, mocker, backend_fixtures): statistics_config=None, training_dataset_type=training_dataset.TrainingDataset.IN_MEMORY, extra_filter=None, - transformation_functions={}, + ) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], ) # Act @@ -1816,7 +1832,7 @@ def test_write_training_dataset_to_df(self, mocker, backend_fixtures): user_write_options={}, save_mode=training_dataset_engine.TrainingDatasetEngine.OVERWRITE, read_options={}, - feature_view_obj=None, + feature_view_obj=fv, to_df=True, ) @@ -1846,6 +1862,24 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): query_df = spark_engine._spark_session.createDataFrame(df) mock_query_read.side_effect = [query_df] + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=None, @@ -1865,7 +1899,6 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): training_dataset_type=training_dataset.TrainingDataset.IN_MEMORY, extra_filter=None, seed=1, - transformation_functions={}, ) # Act @@ -1875,7 +1908,7 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): user_write_options={}, save_mode=training_dataset_engine.TrainingDatasetEngine.OVERWRITE, read_options={}, - feature_view_obj=None, + feature_view_obj=fv, to_df=True, ) @@ -1897,9 +1930,6 @@ def test_write_training_dataset_query(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1910,6 +1940,24 @@ def test_write_training_dataset_query(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -1927,7 +1975,7 @@ def test_write_training_dataset_query(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -1948,9 +1996,6 @@ def 
test_write_training_dataset_query_coalesce(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1961,6 +2006,24 @@ def test_write_training_dataset_query_coalesce(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -1979,7 +2042,7 @@ def test_write_training_dataset_query_coalesce(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -2000,9 +2063,6 @@ def test_write_training_dataset_td_splits(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -2013,6 +2073,24 @@ def test_write_training_dataset_td_splits(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -2034,7 +2112,7 @@ def test_write_training_dataset_td_splits(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -2056,9 +2134,6 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -2069,6 +2144,24 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -2091,7 +2184,7 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + 
feature_view_obj=fv, to_df=None, ) @@ -2575,20 +2668,15 @@ def test_write_training_dataset_splits(self, mocker): spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + hopsworks_udf=plus_one, ) - transformation_fn_dict = dict() - - transformation_fn_dict["col_0"] = tf - f = training_dataset_feature.TrainingDatasetFeature( name="col_0", type=IntegerType(), index=0 ) @@ -2603,7 +2691,6 @@ def plus_one(a) -> int: data_format="CSV", featurestore_id=99, splits={}, - transformation_functions=transformation_fn_dict, features=features, ) @@ -2614,6 +2701,7 @@ def plus_one(a) -> int: write_options=None, save_mode=None, to_df=False, + transformation_functions=[tf("col_0")], ) # Assert @@ -2629,14 +2717,13 @@ def test_write_training_dataset_splits_to_df(self, mocker): spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + hopsworks_udf=plus_one, ) transformation_fn_dict = dict() @@ -2668,6 +2755,7 @@ def plus_one(a) -> int: write_options=None, save_mode=None, to_df=True, + transformation_functions=[tf("col_0")], ) # Assert @@ -4234,42 +4322,100 @@ def test_save_empty_dataframe(self, mocker): assert mock_spark_engine_save_dataframe.call_count == 1 assert mock_spark_table.call_count == 1 - def test_apply_transformation_function(self, mocker): + def test_apply_transformation_function_single_output(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") - + engine._engine_type = "spark" spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( + 99, + hopsworks_udf=plus_one, + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=BooleanType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="long", + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0")], ) - transformation_fn_dict = dict() + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False]} + df = pd.DataFrame(data=d) - transformation_fn_dict["col_0"] = tf + spark_df = spark_engine._spark_session.createDataFrame(df) - f = training_dataset_feature.TrainingDatasetFeature( - name="col_0", type=IntegerType(), index=0 + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "col_2": [True, False], + "plus_one_col_0_": [2, 3], + } + ) # todo why it doesnt return int? 
+ + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, ) - f1 = training_dataset_feature.TrainingDatasetFeature( - name="col_1", type=StringType(), index=1 + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @hopsworks_udf([int, int]) + def plus_two(col1): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) + + tf = transformation_function.TransformationFunction( + 99, + hopsworks_udf=plus_two, ) - features = [f, f1] - td = training_dataset.TrainingDataset( - name="test", + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=BooleanType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", version=1, - data_format="CSV", featurestore_id=99, - splits={}, + primary_key=[], + partition_key=[], features=features, - transformation_functions=transformation_fn_dict, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0")], ) d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False]} @@ -4279,9 +4425,10 @@ def plus_one(a) -> int: expected_df = pd.DataFrame( data={ - "col_0": [2, 3], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "plus_two_col_0_0": [2, 3], + "plus_two_col_0_1": [3, 4], } ) # todo why it doesnt return int? @@ -4289,10 +4436,69 @@ def plus_one(a) -> int: # Act result = spark_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=fv.transformation_functions, dataset=spark_df, ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @hopsworks_udf([int, int]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, + hopsworks_udf=test, + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "test_col_0-col_2_0": [2, 3], + "test_col_0-col_2_1": [12, 13], + } + ) # todo why it doesnt return int? 
+ + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) # Assert assert result.schema == expected_spark_df.schema assert result.collect() == expected_spark_df.collect() diff --git a/python/tests/fixtures/backend_fixtures.py b/python/tests/fixtures/backend_fixtures.py index 34a2c9e594..5a7029172f 100644 --- a/python/tests/fixtures/backend_fixtures.py +++ b/python/tests/fixtures/backend_fixtures.py @@ -56,7 +56,6 @@ "training_dataset_feature", "training_dataset", "training_dataset_split", - "transformation_function_attached", "transformation_function", "user", "validation_report", diff --git a/python/tests/pyproject.toml b/python/tests/pyproject.toml index 15a77ff4fd..3d36a4588e 100644 --- a/python/tests/pyproject.toml +++ b/python/tests/pyproject.toml @@ -8,6 +8,8 @@ ignore = [ # Allow fix for all enabled rules (when `--fix`) is provided. fixable = ["ALL"] unfixable = [] +# transformations_test_helper excluded from fomating and linting because the used formating is required for the test cases +exclude = ["transformations_test_helper/"] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" @@ -29,3 +31,9 @@ skip-magic-trailing-comma = false # Like Black, automatically detect the appropriate line ending. line-ending = "auto" + +[tool.pytest.ini_options] +pythonpath = [ + ".", "tests" +] +addopts = "--ignore=python/tests/transformations_test_helper/" diff --git a/python/tests/test_transformation_function_attached.py b/python/tests/test_transformation_function_attached.py deleted file mode 100644 index 85effdd06e..0000000000 --- a/python/tests/test_transformation_function_attached.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Copyright 2022 Hopsworks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - - -from hsfs import transformation_function, transformation_function_attached - - -class TestTransformationFunctionAttached: - def test_from_response_json(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get"]["response"] - - # Act - tf_attached = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_basic_info(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_basic_info"][ - "response" - ] - - # Act - tf_attached = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_list(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_list"][ - "response" - ] - - # Act - tf_attached_list = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert len(tf_attached_list) == 1 - tf_attached = tf_attached_list[0] - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_list_empty(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_list_empty"][ - "response" - ] - - # Act - tf_attached_list = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert len(tf_attached_list) == 0 From 5608c18d15d9b3665bb374a48dbd9c2cb1debdbc Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 13:26:15 +0200 Subject: [PATCH 19/58] all unit tests working --- python/hsfs/builtin_transformations.py | 67 ++ .../core/builtin_transformation_function.py | 107 --- python/hsfs/feature_store.py | 2 +- python/hsfs/feature_view.py | 7 +- python/hsfs/hopsworks_udf.py | 70 +- python/hsfs/transformation_function.py | 5 +- ...t_python_spark_transformation_functions.py | 710 +++++++++++------- .../tests/fixtures/feature_view_fixtures.json | 27 +- .../transformation_function_fixtures.json | 18 + python/tests/test_transformation_function.py | 24 + 10 files changed, 592 insertions(+), 445 deletions(-) create mode 100644 python/hsfs/builtin_transformations.py delete mode 100644 python/hsfs/core/builtin_transformation_function.py diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py new file mode 100644 index 0000000000..d17ae6f1fa --- /dev/null +++ b/python/hsfs/builtin_transformations.py @@ -0,0 +1,67 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pandas as pd +from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.hopsworks_udf import hopsworks_udf + + +@hopsworks_udf(float) +def min_max_scaler(feature: pd.Series, statistics_feature) -> pd.Series: + return (feature - statistics_feature.min) / ( + statistics_feature.max - statistics_feature.min + ) + + +@hopsworks_udf(float) +def standard_scaler( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + return (feature - statistics_feature.mean) / statistics_feature.stddev + + +@hopsworks_udf(float) +def robust_scaler( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + return (feature - statistics_feature.percentiles[49]) / ( + statistics_feature.percentiles[74] - statistics_feature.percentiles[24] + ) + + +# @hopsworks_udf(int) +def label_encoder( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + unique_data = [ + value for value in statistics_feature.extended_statistics["unique_values"] + ] + value_to_index = {value: index for index, value in enumerate(unique_data)} + return pd.Series([value_to_index[data] for data in feature]) + + +def one_hot_encoder( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + unique_data = [ + value for value in statistics_feature.extended_statistics["unique_values"] + ] + print(statistics_feature.extended_statistics["unique_values"]) + one_hot = pd.get_dummies(feature, dtype="bool") + for data in unique_data: + if data not in one_hot: + one_hot[data] = False + return one_hot diff --git a/python/hsfs/core/builtin_transformation_function.py b/python/hsfs/core/builtin_transformation_function.py deleted file mode 100644 index 7ef5b63555..0000000000 --- a/python/hsfs/core/builtin_transformation_function.py +++ /dev/null @@ -1,107 +0,0 @@ -# -# Copyright 2021 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from typing import List - -from hsfs.client.exceptions import FeatureStoreException -from hsfs.core import feature_descriptive_statistics as fds - - -class BuiltInTransformationFunction: - def __init__(self, method): - self._method = method.lower() - - @staticmethod - def min_max_scaler_stats( - feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics], - feature_name: str, - ): - min_value = None - max_value = None - for stats in feature_descriptive_stats: - if stats.feature_name == feature_name: - if stats.feature_type not in ["Integral", "Fractional", "Decimal"]: - raise ValueError("Can't compute min_max_scaler for this type") - min_value = stats.min - max_value = stats.max - - if min_value is None or max_value is None: - raise FeatureStoreException( - "Feature {feature_name:} doesn't have minimum and/or maximum values computed. 
diff --git a/python/hsfs/core/builtin_transformation_function.py b/python/hsfs/core/builtin_transformation_function.py
deleted file mode 100644
index 7ef5b63555..0000000000
--- a/python/hsfs/core/builtin_transformation_function.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#
-# Copyright 2021 Logical Clocks AB
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from __future__ import annotations
-
-from typing import List
-
-from hsfs.client.exceptions import FeatureStoreException
-from hsfs.core import feature_descriptive_statistics as fds
-
-
-class BuiltInTransformationFunction:
-    def __init__(self, method):
-        self._method = method.lower()
-
-    @staticmethod
-    def min_max_scaler_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        min_value = None
-        max_value = None
-        for stats in feature_descriptive_stats:
-            if stats.feature_name == feature_name:
-                if stats.feature_type not in ["Integral", "Fractional", "Decimal"]:
-                    raise ValueError("Can't compute min_max_scaler for this type")
-                min_value = stats.min
-                max_value = stats.max
-
-        if min_value is None or max_value is None:
-            raise FeatureStoreException(
-                "Feature {feature_name:} doesn't have minimum and/or maximum values computed. Thus can't use "
-                "min_max_scaler method".format(feature_name=feature_name)
-            )
-        return min_value, max_value
-
-    @staticmethod
-    def standard_scaler_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        mean = None
-        std_dev = None
-        for stats in feature_descriptive_stats:
-            if stats.feature_name == feature_name:
-                if stats.feature_type not in ["Integral", "Fractional", "Decimal"]:
-                    raise ValueError("Can't compute standard_scaler for this type")
-                mean = stats.mean
-                std_dev = stats.stddev
-
-        if mean is None or std_dev is None:
-            raise FeatureStoreException(
-                "Feature {feature_name:} doesn't have mean and/or standard deviation computed. Thus can't use "
-                "standard_scaler method".format(feature_name=feature_name)
-            )
-        return mean, std_dev
-
-    @staticmethod
-    def robust_scaler_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        percentiles = None
-        for stats in feature_descriptive_stats:
-            if stats.feature_name == feature_name:
-                if stats.feature_type not in ["Integral", "Fractional", "Decimal"]:
-                    raise ValueError("Can't compute robust_scaler for this type")
-                if stats.percentiles is not None and len(stats.percentiles) > 0:
-                    percentiles = stats.percentiles
-
-        if percentiles is None:
-            raise FeatureStoreException(
-                "Feature {feature_name:} doesn't have mean and/or standard deviation computed. Thus can't use "
-                "standard_scaler method".format(feature_name=feature_name)
-            )
-        return percentiles
-
-    @staticmethod
-    def encoder_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        for stats in feature_descriptive_stats:
-            if (
-                stats.feature_name == feature_name
-                and stats.extended_statistics is not None
-                and "unique_values" in stats.extended_statistics
-            ):
-                unique_data = [
-                    value for value in stats.extended_statistics["unique_values"]
-                ]
-                value_to_index = dict(
-                    (value, index) for index, value in enumerate(unique_data)
-                )
-                return value_to_index
diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index 10f6a269bc..41d1a754ff 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -1316,7 +1316,7 @@ def plus_one(value):
         """
         return TransformationFunction(
             featurestore_id=self._id,
-            transformation_fn=transformation_function,
+            hopsworks_udf=transformation_function,
             version=version,
         )
 
diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py
index 7c8a914dd4..5b90fabfc2 100644
--- a/python/hsfs/feature_view.py
+++ b/python/hsfs/feature_view.py
@@ -3422,9 +3422,10 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView":
             description=json_decamelized.get("description", None),
             featurestore_name=json_decamelized.get("featurestore_name", None),
             serving_keys=serving_keys,
-            transformation_functions=TransformationFunction.from_response_json(
-                transformation_functions
-            )
+            transformation_functions=[
+                TransformationFunction.from_response_json(transformation_function)
+                for transformation_function in transformation_functions
+            ]
             if transformation_functions
             else [],
         )
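With BuiltInTransformationFunction removed, statistics are attached straight
onto the bound UDF object, as the reworked tests later in this patch do. A
sketch using only names that appear in those tests:

    from hsfs.builtin_transformations import min_max_scaler
    from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics

    udf = min_max_scaler("col_0")  # bind the UDF to a concrete input feature
    udf.transformation_statistics = [
        FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2)
    ]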
diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py
index 554a3de9fd..9ed60ead0d 100644
--- a/python/hsfs/hopsworks_udf.py
+++ b/python/hsfs/hopsworks_udf.py
@@ -144,8 +144,10 @@ def __init__(
             else transformation_features
         )
 
-        self._formatted_function_source = HopsworksUdf._format_source_code(
-            self._function_source, self._transformation_features
+        self._formatted_function_source, self._module_imports = (
+            HopsworksUdf._format_source_code(
+                self._function_source, self._transformation_features
+            )
         )
 
         self._output_column_names: List[str] = self._get_output_column_names()
@@ -214,30 +216,6 @@ def _get_module_imports(path: str) -> List[str]:
                 imports.append(import_line)
         return imports
 
-    @staticmethod
-    def _get_module_path(module_name: str) -> str:
-        """
-        Function that returns the path to the source code of a python module.
-
-        Cannot extract path if the module is defined in a jupyter notebook since it is currently impossible find the path of a jupyter notebook.(https://github.com/ipython/ipython/issues/10123)
-
-        # Arguments
-            path: `str`. Path to python file from which imports are to be extracted.
-        # Raises
-            AttributeError : If the provided module is defined in a jupyter notebook.
-        # Returns
-            `str`: a string that contains the path to the module
-        """
-
-        def _get_module_path(module):
-            return module.__file__
-
-        module_path = {}
-        exec(
-            f'import {module_name}\nmodule_path["path"] = _get_module_path({module_name})'
-        )
-        return module_path["path"]
-
     @staticmethod
     def _extract_source_code(udf_function: Callable) -> str:
         """
@@ -252,12 +230,12 @@ def _extract_source_code(udf_function: Callable) -> str:
         """
         try:
             module_imports = HopsworksUdf._get_module_imports(
-                HopsworksUdf._get_module_path(udf_function.__module__)
+                inspect.getfile(udf_function)
            )
-        except AttributeError:
+        except FileNotFoundError:
             module_imports = [""]
             warnings.warn(
-                "Passed UDF defined in a Jupyter notebook. Cannot extract import dependencies from a notebook. Please make sure to import all dependencies for the UDF inside the function.",
+                "Cannot extract imported dependencies for the function module. Please make sure to import all dependencies for the UDF inside the function.",
                 stacklevel=2,
             )
 
@@ -340,7 +318,7 @@ def _extract_function_arguments(source_code: str) -> List[TransformationFeature]:
     @staticmethod
     def _format_source_code(
         source_code: str, transformation_features: List[TransformationFeature]
-    ) -> str:
+    ) -> Tuple[str, str]:
         """
         Function that parses the existing source code to remove statistics parameter and remove all decorators and type hints from the function source code.
 
         # Arguments
             source_code: `str`. Source code of a function.
             transformation_features `List[TransformationFeature]`: List of transformation features provided in the function argument.
         # Returns
-            `str`: Source code that does not contain any decorators, type hints or statistics parameters.
+ `Tuple[str, str]`: Tuple that contains Source code that does not contain any decorators, type hints or statistics parameters and the module imports """ _, signature, _, signature_end_line = HopsworksUdf._parse_function_signature( source_code ) - + module_imports = source_code.split("@")[0] arg_list = [feature.feature_name for feature in transformation_features] # Reconstruct the function signature @@ -367,8 +345,7 @@ def _format_source_code( new_signature + "\n\t" + "\n\t".join(source_code[signature_end_line + 1 :]) ) - # Define a new function with the modified source code - return modified_source + return modified_source, module_imports def _get_output_column_names(self) -> str: """ @@ -423,11 +400,14 @@ def hopsworksUdf_wrapper(self) -> Callable: return date_time_col.dt.tz_localize(str(current_timezone)) else: # convert to utc, then localize to system's timezone - return date_time_col.dt.tz_convert('UTC').dt.tz_localize(None).dt.tz_localize(str(current_timezone))""" + return date_time_col.dt.tz_localize(None).dt.tz_localize(str(current_timezone))""" # Defining wrapper function that renames the column names to specific names if len(self.output_types) > 1: - code = f"""import pandas as pd + code = ( + self._module_imports + + "\n" + + f"""import pandas as pd {convert_timstamp_function} def renaming_wrapper(*args): {self._formatted_function_source} @@ -437,8 +417,12 @@ def renaming_wrapper(*args): if pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = convert_timezone(df[col]) return df""" + ) else: - code = f"""import pandas as pd + code = ( + self._module_imports + + "\n" + + f"""import pandas as pd {convert_timstamp_function} def renaming_wrapper(*args): {self._formatted_function_source} @@ -447,13 +431,13 @@ def renaming_wrapper(*args): if pd.api.types.is_datetime64_any_dtype(df): df = convert_timezone(df) return df""" - + ) + print(code) # injecting variables into scope used to execute wrapper function. scope = __import__("__main__").__dict__ if self.transformation_statistics is not None: scope.update(self.transformation_statistics) scope.update({"_output_col_names": self.output_column_names}) - # executing code exec(code, scope) @@ -524,12 +508,8 @@ def to_dict(self) -> Dict[str, Any]: `Dict`: Dictionary that contains all data required to json serialize the object. 
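For reference, the serialized UDF payload produced here carries the same four
keys the backend fixtures and from_response_json expect; an illustrative, not
authoritative, example (the fixtures store outputTypes and
transformationFeatures as strings, while the in-memory values are lists):

    {
        "sourceCode": "@hopsworks_udf(float)\ndef add_one(col1):\n    return col1 + 1",
        "outputTypes": ["double"],
        "transformationFeatures": ["col1"],
        "name": "add_one",
    }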
""" return { - "sourceCode": self._original_code, - "outputTypes": ",".join( - [python_type.__name__ for python_type in self.output_types] - ) - if isinstance(self.output_types, List) - else self.output_types.__name__, + "sourceCode": self._function_source, + "outputTypes": self.output_types, "transformationFeatures": self.transformation_features, "name": self._function_name, } diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 4e23853c73..b6ef060cb9 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -154,7 +154,10 @@ def from_response_json( tffn_dto["hopsworks_udf"] = HopsworksUdf.from_response_json( tffn_dto["hopsworks_udf"] ) - return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] + if json_decamelized["count"] == 1: + return cls(**json_decamelized["items"][0]) + else: + return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] else: if json_decamelized.get("hopsworks_udf", False): json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index 0e25037751..f5763ea548 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -18,24 +18,25 @@ import datetime import statistics -import numpy as np import pandas as pd import pytest -import pytz import tzlocal from hsfs import ( + engine, training_dataset, training_dataset_feature, transformation_function, ) +from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.core.transformation_function_engine import TransformationFunctionEngine from hsfs.engine import python, spark +from hsfs.hopsworks_udf import HopsworksUdf, hopsworks_udf from pyspark.sql.types import ( BooleanType, DateType, DoubleType, IntegerType, + LongType, StringType, StructField, StructType, @@ -44,27 +45,7 @@ class TestPythonSparkTransformationFunctions: - def _create_training_dataset( - self, tf_fun, output_type=None, name=None, col="col_0" - ): - if isinstance(tf_fun, str): - tf = transformation_function.TransformationFunction( - name=name, - featurestore_id=99, - transformation_fn=None, - source_code_content=tf_fun, - output_type=output_type, - ) - else: - tf = transformation_function.TransformationFunction( - featurestore_id=99, - transformation_fn=tf_fun, - builtin_source_code=None, - output_type=output_type, - ) - transformation_fn_dict = dict() - transformation_fn_dict[col] = tf - + def _create_training_dataset(self): f = training_dataset_feature.TrainingDatasetFeature( name="col_0", type=IntegerType(), index=0 ) @@ -83,18 +64,18 @@ def _create_training_dataset( featurestore_id=99, splits={}, features=features, - transformation_functions=transformation_fn_dict, ) return td - def _validate_on_python_engine(self, td, df, expected_df): + def _validate_on_python_engine(self, td, df, expected_df, transformation_functions): # Arrange + engine._engine_type = "python" python_engine = python.Engine() # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=transformation_functions, dataset=df, ) @@ -102,13 +83,16 @@ def _validate_on_python_engine(self, td, df, expected_df): assert list(result.dtypes) == list(expected_df.dtypes) assert 
result.equals(expected_df) - def _validate_on_spark_engine(self, td, spark_df, expected_spark_df): + def _validate_on_spark_engine( + self, td, spark_df, expected_spark_df, transformation_functions + ): # Arrange + engine._engine_type = "spark" spark_engine = spark.Engine() # Act result = spark_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=transformation_functions, dataset=spark_df, ) @@ -116,9 +100,10 @@ def _validate_on_spark_engine(self, td, spark_df, expected_spark_df): assert result.schema == expected_spark_df.schema assert result.collect() == expected_spark_df.collect() - def test_apply_builtin_minmax(self, mocker): + def test_apply_builtin_minmax_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -139,16 +124,16 @@ def test_apply_builtin_minmax(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("min_max_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [0.5, 1.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "min_max_scaler_col_0_": [0.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -156,34 +141,43 @@ def test_apply_builtin_minmax(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": ' - '"def min_max_scaler(value, min_value,max_value):\\n if value is None:\\n ' - "return None\\n else:\\n try:\\n return (value - min_value) / (max_value - min_value)\\n" - ' except ZeroDivisionError:\\n return 0\\n"}' - ) - - td = self._create_training_dataset(tf_fun, "DOUBLE", "min_max_scaler") - - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", feature_type="Integral", min=0, max=2 - ) - ], + tf_fun_source = ( + "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" + "from hsfs.hopsworks_udf import hopsworks_udf\n" + "@hopsworks_udf(float)\ndef min_max_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" + " return (feature - statistics_feature.min)/(statistics_feature.max-statistics_feature.min)\n" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": "double", + "transformationFeatures": "", + "name": "min_max_scaler", + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), featurestore_id=99 ) - ) + ] + + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_labelencoder(self, mocker): + def test_apply_builtin_minmax(self, mocker): # 
Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -204,53 +198,47 @@ def test_apply_builtin_labelencoder(self, mocker): expected_schema = StructType( [ - StructField("col_0", IntegerType(), True), - StructField("col_1", IntegerType(), True), + StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("min_max_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [1, 2], - "col_1": [0, 1], + "col_1": ["test_1", "test_2"], "col_2": [True, False], + "min_max_scaler_col_0_": [0.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - expected_df["col_1"] = expected_df["col_1"].astype(pd.Int32Dtype()) # Arrange - tf_fun = ( - '{"module_imports": "", "transformer_code": "# label encoder\\n' - "def label_encoder(value, value_to_index):\\n" - " # define a mapping of values to integers\\n" - ' return value_to_index[value]"}' - ) + from hsfs.builtin_transformations import min_max_scaler - td = self._create_training_dataset(tf_fun, "INT", "label_encoder", "col_1") + td = self._create_training_dataset() - td.transformation_functions["col_1"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_1", - td.transformation_functions["col_1"], - [ - FeatureDescriptiveStatistics( - feature_name="col_1", - extended_statistics={"unique_values": ["test_1", "test_2"]}, - ) - ], + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=min_max_scaler("col_0"), featurestore_id=99 ) - ) + ] + + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_standard_scaler(self, mocker): + def test_apply_builtin_standard_scaler_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -271,16 +259,16 @@ def test_apply_builtin_standard_scaler(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("standard_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [-1.0, 1.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "standard_scaler_col_0_": [-1.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -288,39 +276,44 @@ def test_apply_builtin_standard_scaler(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": "' - "def standard_scaler(value, mean, std_dev):\\n if value is None:\\n return None\\n " - "else:\\n try:\\n return (value - mean) / std_dev\\n except " - 'ZeroDivisionError:\\n return 0\\n"}' - ) - - td = self._create_training_dataset(tf_fun, "DOUBLE", "standard_scaler") - + tf_fun_source = ( + "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import 
FeatureDescriptiveStatistics\n" + "from hsfs.hopsworks_udf import hopsworks_udf\n" + "@hopsworks_udf(float)\ndef standard_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" + " return (feature - statistics_feature.mean)/statistics_feature.stddev\n" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": "double", + "transformationFeatures": "", + "name": "standard_scaler", + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + ) + ] mean = statistics.mean([1, 2]) stddev = statistics.pstdev([1, 2]) - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", - feature_type="Integral", - mean=mean, - stddev=stddev, - ) - ], - ) - ) + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", mean=mean, stddev=stddev) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_robustscaler(self, mocker): + def test_apply_builtin_standard_scaler(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -341,16 +334,16 @@ def test_apply_builtin_robustscaler(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("standard_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [-1.0, 0.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "standard_scaler_col_0_": [-1.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -358,40 +351,32 @@ def test_apply_builtin_robustscaler(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": "' - "def robust_scaler(value, p25, p50, p75):\\n if value is None:\\n " - "return None\\n else:\\n try:\\n return (value - p50) / (p75 - p25)\\n " - 'except ZeroDivisionError:\\n return 0\\n"}\n' - ) + from hsfs.builtin_transformations import standard_scaler - td = self._create_training_dataset(tf_fun, "DOUBLE", "robust_scaler") + td = self._create_training_dataset() - percentiles = [1] * 100 - percentiles[24] = 1 - percentiles[49] = 2 - percentiles[74] = 2 - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", - feature_type="Integral", - percentiles=percentiles, - ) - ], + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=standard_scaler("col_0"), featurestore_id=99 ) - ) + ] + + mean = statistics.mean([1, 2]) + stddev = statistics.pstdev([1, 2]) + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + 
FeatureDescriptiveStatistics(feature_name="col_0", mean=mean, stddev=stddev) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_int(self, mocker): + def test_apply_builtin_robust_scaler_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -412,36 +397,63 @@ def test_apply_plus_one_int(self, mocker): expected_schema = StructType( [ - StructField("col_0", IntegerType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("robust_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [2, 3], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "robust_scaler_col_0_": [-1.0, 0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - expected_df["col_0"] = expected_df["col_0"].astype(pd.Int32Dtype()) # Arrange - def tf_fun(a) -> int: - return a + 1 - - td = self._create_training_dataset(tf_fun, "int") + tf_fun_source = ( + "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" + "from hsfs.hopsworks_udf import hopsworks_udf\n" + "@hopsworks_udf(float)\ndef robust_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" + " return (feature - statistics_feature.percentiles[49])/(statistics_feature.percentiles[74]-statistics_feature.percentiles[24])\n" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": "double", + "transformationFeatures": "", + "name": "robust_scaler", + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + ) + ] + percentiles = [1] * 100 + percentiles[24] = 1 + percentiles[49] = 2 + percentiles[74] = 2 + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_str(self, mocker): + def test_apply_builtin_robust_scaler(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -462,16 +474,16 @@ def test_apply_plus_one_str(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("robust_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "robust_scaler_col_0_": [-1.0, 0], } ) expected_spark_df = 
spark_engine._spark_session.createDataFrame( @@ -479,16 +491,31 @@ def test_apply_plus_one_str(self, mocker): ) # Arrange - def tf_fun(a) -> int: - return a + 1 + from hsfs.builtin_transformations import robust_scaler + + td = self._create_training_dataset() - td = self._create_training_dataset(tf_fun, "string") + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=robust_scaler("col_0"), featurestore_id=99 + ) + ] + + percentiles = [1] * 100 + percentiles[24] = 1 + percentiles[49] = 2 + percentiles[74] = 2 + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_double(self, mocker): + def test_apply_plus_one_int(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() @@ -507,96 +534,103 @@ def test_apply_plus_one_double(self, mocker): "col_2": [True, False], } ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", LongType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [2.0, 3.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [2, 3], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - def tf_fun(a) -> np.float64: - return a + 1.0 + @hopsworks_udf(int) + def tf_fun(col_0): + return col_0 + 1 + + td = self._create_training_dataset() - td = self._create_training_dataset(tf_fun, "double") + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_no_tz(self, mocker): + def test_apply_plus_one_str(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": ["1", "2"], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", StringType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1640995201), - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": 
["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": ["11", "21"], } ) - # convert timestamps to current timezone - local_tz = tzlocal.get_localzone() - expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df_localized, schema=expected_schema + expected_df, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1) - - td = self._create_training_dataset(tf_fun, "datetime") + @hopsworks_udf(str) + def tf_fun(col_0): + return col_0 + "1" + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_tz_utc(self, mocker): + def test_apply_plus_one_double(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() @@ -610,127 +644,143 @@ def test_apply_plus_one_datetime_tz_utc(self, mocker): ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1640995201), - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [2.0, 3.0], } ) - # convert timestamps to current timezone - local_tz = tzlocal.get_localzone() - expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df_localized, schema=expected_schema + expected_df, schema=expected_schema ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1).replace( - tzinfo=datetime.timezone.utc + @hopsworks_udf(float) + def tf_fun(col_0): + return col_0 + 1.0 + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 ) - - td = self._create_training_dataset(tf_fun, "datetime") + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_tz_pst(self, mocker): + def test_apply_plus_one_datetime_no_tz(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - 
StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) - expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1641024001), - datetime.datetime.utcfromtimestamp(1641024002), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) # convert timestamps to current timezone local_tz = tzlocal.get_localzone() expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - pdt = pytz.timezone("US/Pacific") - return pdt.localize(datetime.datetime.utcfromtimestamp(a + 1)) + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0): + import datetime + + return col_0 + datetime.timedelta(milliseconds=1) - td = self._create_training_dataset(tf_fun, "datetime") + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_ts_none(self, mocker): + def test_apply_plus_one_datetime_tz_utc(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -739,59 +789,75 @@ def test_apply_plus_one_datetime_ts_none(self, mocker): expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) - expected_df = pd.DataFrame( data={ - "col_0": [ - None, - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + 
datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) # convert timestamps to current timezone local_tz = tzlocal.get_localzone() expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return ( - None if a == 1640995200 else datetime.datetime.utcfromtimestamp(a + 1) + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0) -> datetime.datetime: + import datetime + + return (col_0 + datetime.timedelta(milliseconds=1)).dt.tz_localize( + datetime.timezone.utc ) - td = self._create_training_dataset(tf_fun, "datetime") + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_date(self, mocker): + def test_apply_plus_one_datetime_tz_pst(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1641045600, 1641132000], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -800,50 +866,77 @@ def test_apply_plus_one_date(self, mocker): expected_schema = StructType( [ - StructField("col_0", DateType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) + expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1641045601).date(), - datetime.datetime.utcfromtimestamp(1641132001).date(), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) + # convert timestamps to current timezone + local_tz = tzlocal.get_localzone() + expected_df_localized = expected_df.copy(True) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df, schema=expected_schema + expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1) + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0) -> datetime.datetime: + import datetime + + import pytz - td = self._create_training_dataset(tf_fun, "date") + pdt = 
pytz.timezone("US/Pacific") + return (col_0 + datetime.timedelta(milliseconds=1)).dt.tz_localize(pdt) + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_no_type(self, mocker): + def test_apply_plus_one_datetime_ts_none(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1, 2], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -852,47 +945,78 @@ def test_apply_plus_one_no_type(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) + expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + None, + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) + # convert timestamps to current timezone + local_tz = tzlocal.get_localzone() + expected_df_localized = expected_df.copy(True) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df, schema=expected_schema + expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> int: - return a + 1 + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0) -> datetime.datetime: + import datetime + + return pd.Series( + None + if data == datetime.datetime.utcfromtimestamp(1640995200) + else data + datetime.timedelta(milliseconds=1) + for data in col_0 + ) - td = self._create_training_dataset(tf_fun) + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_empty_type(self, mocker): + def test_apply_plus_one_date(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", DateType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1, 2], + "col_0": [ + datetime.datetime.utcfromtimestamp(1641045600).date(), + datetime.datetime.utcfromtimestamp(1641132000).date(), + ], "col_1": 
["test_1", "test_2"], "col_2": [True, False], } @@ -901,16 +1025,21 @@ def test_apply_plus_one_empty_type(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", DateType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1641045600).date() + + datetime.timedelta(days=1), + datetime.datetime.utcfromtimestamp(1641132000).date() + + datetime.timedelta(days=1), + ], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -918,26 +1047,37 @@ def test_apply_plus_one_empty_type(self, mocker): ) # Arrange - def tf_fun(a) -> int: - return a + 1 + @hopsworks_udf(datetime.date) + def tf_fun(col_0): + import datetime - td = self._create_training_dataset(tf_fun, "") + return col_0 + datetime.timedelta(days=1) + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_date_not_supported_type(self, mocker): + def test_apply_plus_one_invalid_type(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") # Arrange - def tf_fun(a) -> int: - return a + 1 + with pytest.raises(FeatureStoreException) as e_info: - # Act - with pytest.raises(TypeError) as e_info: - self._create_training_dataset(tf_fun, list) + @hopsworks_udf(list) + def tf_fun(a): + return a + 1 - # Assert - assert str(e_info.value) == "Not supported type ." + assert ( + str(e_info.value) + == f"Output type {list} is not supported. Please refer to the documentation to get more information on the supported types." 
+ ) diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index 92601b46da..e515e0d0df 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -684,9 +684,30 @@ "id": 11, "version": 1, "description": "test_description", - "transformation_functions": { - "featurestore_id": 5 - }, + "transformation_functions": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } + } + ], "features": [ { "name": "intt", diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 5b8e753508..169d779bd6 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -80,6 +80,24 @@ ] } }, + "get_list_one_argument": { + "response": { + "count": 1, + "items": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + } + ] + } + }, "get_list_empty": { "response": { "count": 0, diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index 5fdea2987f..b54fbdbe6b 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -171,6 +171,30 @@ def test_from_response_json_list(self, backend_fixtures): == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) + def test_from_response_json_list_one_argument(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"]["get_list_one_argument"][ + "response" + ] + + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert not isinstance(tf, list) + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + ) + def test_transformation_function_definition_no_hopworks_udf(self): def test(col1): return col1 + 1 From 3fc94f883ced13eb223ad079252b7460630a0ac1 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 13:30:53 +0200 Subject: [PATCH 20/58] removed print --- python/hsfs/hopsworks_udf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 9ed60ead0d..912c7e1456 100644 --- a/python/hsfs/hopsworks_udf.py +++ 
b/python/hsfs/hopsworks_udf.py
@@ -432,7 +432,6 @@ def renaming_wrapper(*args):
             df = convert_timezone(df)
     return df"""
             )
-        print(code)
         # injecting variables into scope used to execute wrapper function.
         scope = __import__("__main__").__dict__
         if self.transformation_statistics is not None:

From 2bf5f2059ff51992d60fc96f731b8b9a8619e8fe Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Mon, 13 May 2024 14:06:47 +0200
Subject: [PATCH 21/58] adding test for hopsworks_udf

---
 python/hsfs/hopsworks_udf.py                  |   3 +-
 python/pyproject.toml                         |   3 +-
 python/tests/pyproject.toml                   |   4 +-
 python/tests/test_helpers/__init__.py         |   0
 .../transformation_test_helper.py             |  92 ++++
 python/tests/test_hopswork_udf.py             | 503 ++++++++++++++++++
 6 files changed, 599 insertions(+), 6 deletions(-)
 create mode 100644 python/tests/test_helpers/__init__.py
 create mode 100644 python/tests/test_helpers/transformation_test_helper.py
 create mode 100644 python/tests/test_hopswork_udf.py

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py
index 912c7e1456..9b3b332812 100644
--- a/python/hsfs/hopsworks_udf.py
+++ b/python/hsfs/hopsworks_udf.py
@@ -278,6 +278,7 @@ def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, int]:
             ]
         )
         arg_list = signature.split("(")[1].split(")")[0].split(",")
+        arg_list = [arg for arg in arg_list if not arg.strip() == ""]
         return arg_list, signature, signature_start_line, signature_end_line
 
     @staticmethod
@@ -293,7 +294,7 @@ def _extract_function_arguments(source_code: str) -> List[TransformationFeature]:
         # Get source code of the original function
         arg_list, _, _, _ = HopsworksUdf._parse_function_signature(source_code)
 
-        if arg_list == [""]:
+        if arg_list == []:
             raise FeatureStoreException(
                 "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function."
             )
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 1ad6c8c5f4..77fe01a61f 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -142,8 +142,7 @@ exclude = [
     "node_modules",
     "site-packages",
     "venv",
-    "java",
-    "python/tests/transformations_test_helper/" # transformations_test_helper excluded from fomating and linting because the used formating is required for the test cases
+    "java"
 ]
 
 # Same as Black.
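The arg_list filter added above guards the zero-argument case: splitting an
empty parameter list yields [""], not []. A quick illustration of why the
guard below it has to compare against the empty list:

    signature = "def test_function():"
    args = signature.split("(")[1].split(")")[0].split(",")  # -> [""]
    args = [arg for arg in args if not arg.strip() == ""]    # -> []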
diff --git a/python/tests/pyproject.toml b/python/tests/pyproject.toml
index 3d36a4588e..050735f853 100644
--- a/python/tests/pyproject.toml
+++ b/python/tests/pyproject.toml
@@ -8,8 +8,6 @@ ignore = [
 # Allow fix for all enabled rules (when `--fix`) is provided.
 fixable = ["ALL"]
 unfixable = []
-# transformations_test_helper excluded from fomating and linting because the used formating is required for the test cases
-exclude = ["transformations_test_helper/"]
 
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
@@ -36,4 +34,4 @@ line-ending = "auto"
 pythonpath = [
   ".", "tests"
 ]
-addopts = "--ignore=python/tests/transformations_test_helper/"
+addopts = "--ignore=python/tests/test_helpers/"
diff --git a/python/tests/test_helpers/__init__.py b/python/tests/test_helpers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/tests/test_helpers/transformation_test_helper.py b/python/tests/test_helpers/transformation_test_helper.py
new file mode 100644
index 0000000000..8b81c48fde
--- /dev/null
+++ b/python/tests/test_helpers/transformation_test_helper.py
@@ -0,0 +1,92 @@
+import pandas as pd
+from hsfs.statistics import FeatureDescriptiveStatistics
+
+
+def test_function():
+    return True
+
+
+def test_function_one_argument(arg1):
+    pass
+
+
+def test_function_one_argument_with_statistics(arg1, statistics_arg1):
+    pass
+
+
+def test_function_one_argument_with_typehints(arg1: pd.Series):
+    pass
+
+
+def test_function_one_argument_with_statistics_and_typehints(
+    arg1: pd.Series, statistics_arg1: FeatureDescriptiveStatistics
+):
+    pass
+
+
+def test_function_multiple_argument(arg1, arg2):
+    pass
+
+
+def test_function_multiple_argument_with_statistics(
+    arg1, arg2, arg3, statistics_arg1, statistics_arg3
+):
+    pass
+
+
+def test_function_multiple_argument_with_typehints(arg1: pd.Series, arg2: pd.Series):
+    pass
+
+
+def test_function_multiple_argument_with_statistics_and_typehints(
+    arg1: pd.Series,
+    arg2: pd.Series,
+    statistics_arg1: FeatureDescriptiveStatistics,
+    statistics_arg2: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_with_mixed_statistics_and_typehints(
+    arg1: pd.Series,
+    arg2,
+    arg3,
+    statistics_arg1,
+    statistics_arg3: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_all_parameter_with_spaces(
+    arg1: pd.Series,
+    arg2,
+    statistics_arg1,
+    statistics_arg2: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_all_parameter_multiline(
+    arg1: pd.Series,
+    arg2,
+    statistics_arg1,
+    arg3,
+    statistics_arg3: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_all_parameter_multiline_with_comments(
+    arg1: pd.Series,  # Test Comment
+    arg2,
+    statistics_arg1,  # Test Comment
+    arg3,
+    statistics_arg3: FeatureDescriptiveStatistics,
+) -> pd.DataFrame:  # Test Comment
+    pass
+
+
+def test_function_statistics_invalid(
+    arg1: pd.Series, statistics_arg3: FeatureDescriptiveStatistics
+):
+    pass
diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py
new file mode 100644
index 0000000000..04dab45309
--- /dev/null
+++ b/python/tests/test_hopswork_udf.py
@@ -0,0 +1,503 @@
+#
+# Copyright 2024 Hopsworks AB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +from datetime import date, datetime, time + +import pandas as pd +import pytest +from hsfs.client.exceptions import FeatureStoreException +from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, hopsworks_udf + + +class TestHopsworksUdf: + def test_validate_and_convert_output_types_one_elements(self): + assert HopsworksUdf._validate_and_convert_output_types([int]) == ["bigint"] + + assert HopsworksUdf._validate_and_convert_output_types([float]) == ["double"] + + assert HopsworksUdf._validate_and_convert_output_types([str]) == ["string"] + + assert HopsworksUdf._validate_and_convert_output_types([bool]) == ["boolean"] + + assert HopsworksUdf._validate_and_convert_output_types([datetime]) == [ + "timestamp" + ] + + assert HopsworksUdf._validate_and_convert_output_types([time]) == ["timestamp"] + + assert HopsworksUdf._validate_and_convert_output_types([date]) == ["date"] + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + def test_validate_and_convert_output_types_multiple_types(self): + assert HopsworksUdf._validate_and_convert_output_types( + [int, float, str, bool, datetime, date, time] + ) == ["bigint", "double", "string", "boolean", "timestamp", "date", "timestamp"] + + assert HopsworksUdf._validate_and_convert_output_types( + ["bigint", "double", "string", "boolean", "timestamp", "date"] + ) == ["bigint", "double", "string", "boolean", "timestamp", "date"] + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + def test_validate_and_convert_output_types_invalid_types(self): + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([int, pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([int, "pd.DatetimeTZDtype"]) + + assert ( + str(exception.value) + == "Output type pd.DatetimeTZDtype is not supported. Please refer to the documentation to get more information on the supported types." 
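These assertions pin down the Python-to-SQL output type mapping that
_validate_and_convert_output_types applies; summarized as a sketch (anything
outside this table, such as pd.DatetimeTZDtype, raises FeatureStoreException):

    from datetime import date, datetime, time

    TYPE_MAPPING = {
        int: "bigint", float: "double", str: "string", bool: "boolean",
        datetime: "timestamp", time: "timestamp", date: "date",
    }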
+ ) + + def test_get_module_imports(self): + assert HopsworksUdf._get_module_imports( + "python/tests/test_helpers/transformation_test_helper.py" + ) == [ + "import pandas as pd", + "from hsfs.statistics import FeatureDescriptiveStatistics", + ] + + def test_extract_source_code(self): + from test_helpers.transformation_test_helper import test_function + + assert """import pandas as pd +from hsfs.statistics import FeatureDescriptiveStatistics +def test_function(): + return True""" == HopsworksUdf._extract_source_code(test_function).strip() + + def test_extract_function_arguments_no_arguments(self): + from test_helpers.transformation_test_helper import test_function + + with pytest.raises(FeatureStoreException) as exception: + function_source = HopsworksUdf._extract_source_code(test_function) + HopsworksUdf._extract_function_arguments(function_source) + + assert ( + str(exception.value) + == "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." + ) + + def test_extract_function_arguments_one_argument(self): + from test_helpers.transformation_test_helper import test_function_one_argument + + function_source = HopsworksUdf._extract_source_code(test_function_one_argument) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None) + ] + + def test_extract_function_arguments_one_argument_with_statistics(self): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_statistics, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_one_argument_with_statistics + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ) + ] + + def test_extract_function_arguments_one_argument_with_typehint(self): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_one_argument_with_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None) + ] + + def test_extract_function_arguments_one_argument_with_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_statistics_and_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_one_argument_with_statistics_and_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ) + ] + + def test_extract_function_arguments_multiple_argument(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + ] + + def 
test_extract_function_arguments_multiple_argument_with_statistics(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_statistics, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_statistics + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature( + feature_name="arg3", statistic_argument_name="statistics_arg3" + ), + ] + + def test_extract_function_arguments_multiple_argument_with_typehints(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + ] + + def test_extract_function_arguments_multiple_argument_with_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_statistics_and_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_statistics_and_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature( + feature_name="arg2", statistic_argument_name="statistics_arg2" + ), + ] + + def test_extract_function_arguments_multiple_argument_with_mixed_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_mixed_statistics_and_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_mixed_statistics_and_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature( + feature_name="arg3", statistic_argument_name="statistics_arg3" + ), + ] + + def test_extract_function_arguments_multiple_argument_all_parameter_with_spaces( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_with_spaces, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_all_parameter_with_spaces + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature( + feature_name="arg2", statistic_argument_name="statistics_arg2" + ), + ] + + def test_extract_function_arguments_multiple_argument_all_parameter_multiline(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_multiline, + ) + + function_source = 
HopsworksUdf._extract_source_code(
+            test_function_multiple_argument_all_parameter_multiline
+        )
+        function_argument = HopsworksUdf._extract_function_arguments(function_source)
+
+        assert function_argument == [
+            TransformationFeature(
+                feature_name="arg1", statistic_argument_name="statistics_arg1"
+            ),
+            TransformationFeature(feature_name="arg2", statistic_argument_name=None),
+            TransformationFeature(
+                feature_name="arg3", statistic_argument_name="statistics_arg3"
+            ),
+        ]
+
+    def test_extract_function_arguments_multiple_argument_all_parameter_multiline_with_comments(
+        self,
+    ):
+        from test_helpers.transformation_test_helper import (
+            test_function_multiple_argument_all_parameter_multiline_with_comments,
+        )
+
+        function_source = HopsworksUdf._extract_source_code(
+            test_function_multiple_argument_all_parameter_multiline_with_comments
+        )
+        function_argument = HopsworksUdf._extract_function_arguments(function_source)
+
+        assert function_argument == [
+            TransformationFeature(
+                feature_name="arg1", statistic_argument_name="statistics_arg1"
+            ),
+            TransformationFeature(feature_name="arg2", statistic_argument_name=None),
+            TransformationFeature(
+                feature_name="arg3", statistic_argument_name="statistics_arg3"
+            ),
+        ]
+
+    def test_extract_function_arguments_statistics_invalid(self):
+        from test_helpers.transformation_test_helper import (
+            test_function_statistics_invalid,
+        )
+
+        with pytest.raises(FeatureStoreException) as exception:
+            function_source = HopsworksUdf._extract_source_code(
+                test_function_statistics_invalid
+            )
+            HopsworksUdf._extract_function_arguments(function_source)
+
+        assert (
+            str(exception.value)
+            == "No argument corresponding to statistics parameter 'statistics_arg3' present in function definition."
+        )
+
+    def test_format_source_code(self):
+        from test_helpers.transformation_test_helper import (
+            test_function_multiple_argument_all_parameter_multiline_with_comments,
+        )
+
+        function_source = HopsworksUdf._extract_source_code(
+            test_function_multiple_argument_all_parameter_multiline_with_comments
+        )
+        function_argument = HopsworksUdf._extract_function_arguments(function_source)
+        formatted_source, module_imports = HopsworksUdf._format_source_code(
+            function_source, function_argument
+        )
+        assert (
+            formatted_source.strip()
+            == """def test_function_multiple_argument_all_parameter_multiline_with_comments(arg1, arg2, arg3):
+\t pass"""
+        )
+
+    def test_generate_output_column_names_one_argument_one_output_type(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        assert test_func._get_output_column_names() == ["test_func_col1_"]
+
+    def test_generate_output_column_names_multiple_argument_one_output_type(self):
+        @hopsworks_udf(int)
+        def test_func(col1, col2, col3):
+            return col1 + 1
+
+        assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"]
+
+    def test_generate_output_column_names_single_argument_multiple_output_type(self):
+        @hopsworks_udf([int, float, int])
+        def test_func(col1):
+            return pd.DataFrame(
+                {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]}
+            )
+
+        assert test_func._get_output_column_names() == [
+            "test_func_col1_0",
+            "test_func_col1_1",
+            "test_func_col1_2",
+        ]
+
+    def test_generate_output_column_names_multiple_argument_multiple_output_type(self):
+        @hopsworks_udf([int, float, int])
+        def test_func(col1, col2, col3):
+            return pd.DataFrame(
+                {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]}
+            )
+
+        assert test_func._get_output_column_names() == [
+            "test_func_col1-col2-col3_0",
+            "test_func_col1-col2-col3_1",
+            "test_func_col1-col2-col3_2",
+        ]
+
+    def test_create_pandas_udf_return_schema_from_list_one_output_type(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        assert test_func._create_pandas_udf_return_schema_from_list() == "bigint"
+
+    def test_create_pandas_udf_return_schema_from_list_one_argument_multiple_output_type(
+        self,
+    ):
+        @hopsworks_udf([int, float, str, date, datetime, time, bool])
+        def test_func(col1):
+            return pd.DataFrame(
+                {
+                    "col1": [col1 + 1],
+                    "col2": [col1 + 1],
+                    "col3": [col1 + 1],
+                    "col4": [col1 + 1],
+                    "col5": [col1 + 1],
+                    "col6": [col1 + 1],
+                    "col7": [True],
+                }
+            )
+
+        assert (
+            test_func._create_pandas_udf_return_schema_from_list()
+            == "`test_func_col1_0` bigint, `test_func_col1_1` double, `test_func_col1_2` string, `test_func_col1_3` date, `test_func_col1_4` timestamp, `test_func_col1_5` timestamp, `test_func_col1_6` boolean"
+        )
+
+    def test_hopsworks_wrapper_single_output(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        renaming_wrapper_function = test_func.hopsworksUdf_wrapper()
+
+        test_dataframe = pd.DataFrame({"col1": [1, 2, 3, 4]})
+
+        result = renaming_wrapper_function(test_dataframe["col1"])
+
+        assert result.name == "test_func_col1_"
+        assert result.values.tolist() == [2, 3, 4, 5]
+
+    def test_hopsworks_wrapper_multiple_output(self):
+        @hopsworks_udf([int, float])
+        def test_func(col1, col2):
+            return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2})
+
+        renaming_wrapper_function = test_func.hopsworksUdf_wrapper()
+
+        test_dataframe = pd.DataFrame(
+            {"column1": [1, 2, 3, 4], "column2": [10, 20, 30, 40]}
+        )
+
+        result = renaming_wrapper_function(
+            test_dataframe["column1"], test_dataframe["column2"]
+        )
+
+        assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"])
+        assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]]
+
+    def test_hopsworks_udf_call_one_argument(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        assert test_func.transformation_features == ["col1"]
+        assert test_func.statistics_features == []
+
+        assert test_func("new_feature").transformation_features == ["new_feature"]
+        assert test_func("new_feature").statistics_features == []
+
+    def test_hopsworks_udf_call_one_argument_statistics(self):
+        @hopsworks_udf(int)
+        def test_func(col1, statistics_col1):
+            return col1 + statistics_col1
+
+        assert test_func.transformation_features == ["col1"]
+        assert test_func.statistics_features == ["col1"]
+
+        assert test_func("new_feature").transformation_features == ["new_feature"]
+        assert test_func("new_feature").statistics_features == ["new_feature"]
+
+    def test_hopsworks_udf_call_multiple_argument_statistics(self):
+        @hopsworks_udf(int)
+        def test_func(col1, statistics_col1, col2, col3, statistics_col3):
+            return col1 + statistics_col1
+
+        assert test_func.transformation_features == ["col1", "col2", "col3"]
+        assert test_func.statistics_features == ["col1", "col3"]
+
+        assert test_func("f1", "f2", "f3").transformation_features == ["f1", "f2", "f3"]
+        assert test_func("f1", "f2", "f3").statistics_features == ["f1", "f3"]
From 594640cfb5d6f70fb1a2111232a1b16f101fe8fd Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Mon, 13 May 2024 15:00:31 +0200
Subject: [PATCH 22/58] correcting merge for vector server

---
 .../core/transformation_function_engine.py    |  15 --
 python/hsfs/core/vector_server.py             |  20 +-
 .../tests/fixtures/feature_view_fixtures.json | 228 
++++++++++++++++++ 3 files changed, 233 insertions(+), 30 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 2396cb1a03..0384e05ac9 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -29,21 +29,6 @@ from hsfs.statistics import Statistics from hsfs.transformation_function import TransformationFunction -from hsfs import ( - feature_view, - statistics, - training_dataset, - training_dataset_feature, - transformation_function_attached, - util, -) -from hsfs.core import ( - feature_view_api, - statistics_api, - statistics_engine, - transformation_function_api, -) -from hsfs.core.builtin_transformation_function import BuiltInTransformationFunction class TransformationFunctionEngine: BUILTIN_FN_NAMES = [ diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index c6cd5959bd..1ef1df0854 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -38,19 +38,14 @@ from hsfs import ( training_dataset_feature as tdf_mod, ) -from hsfs import ( - transformation_function_attached as tfa_mod, -) from hsfs.client import exceptions, online_store_rest_client from hsfs.core import ( online_store_rest_client_engine, online_store_sql_engine, -) -from hsfs.core import ( transformation_function_engine as tf_engine_mod, + transformation_functions ) - HAS_FASTAVRO = False try: from fastavro import schemaless_reader @@ -104,7 +99,6 @@ def __init__( feat.name for feat in features if feat.inference_helper_column ] self._transformed_feature_vector_col_name: List[str] = None - self._skip_fg_ids = skip_fg_ids or set() self._serving_keys = serving_keys or [] self._required_serving_keys = [] @@ -112,9 +106,8 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: Dict[ - str, tfa_mod.TransformationFunctionAttached - ] = {} + self._transformation_functions: List[transformation_functions.TransformationFunction] = [] + self._sql_client = None self._rest_client_engine = None @@ -301,7 +294,6 @@ def get_feature_vectors( """Assembles serving vector from online feature store.""" if passed_features is None: passed_features = [] - # Assertions on passed_features and vector_db_features assert ( passed_features is None @@ -573,7 +565,7 @@ def get_inference_helpers( return self.handle_feature_vector_return_type( batch_results, batch=True, inference_helper=True, return_type=return_type ) - + def which_client_and_ensure_initialised( self, force_rest_client: bool, force_sql_client: bool @@ -1005,9 +997,7 @@ def per_serving_key_features(self) -> Dict[str, set[str]]: @property def transformation_functions( self, - ) -> Dict[str, tfa_mod.TransformationFunctionAttached]: - if self._transformation_functions is None: - self._transformation_functions = {} + ) -> Optional[List[transformation_functions.TransformationFunction]]: return self._transformation_functions @property diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index e515e0d0df..da5c7766ed 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -762,5 +762,233 @@ } ] } + }, + "get_transformations": { + "response": { + "name": "test_name", + "query": { + "left_feature_group": { + "type": "cachedFeaturegroupDTO", + "validation_type": 
"test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "left_features": ["intt"], + "feature_store_name": "test_feature_store_name", + "feature_store_id": 67, + "left_feature_group_start_time": "test_start_time", + "left_feature_group_end_time": "test_end_time", + "joins": [ + { + "query": { + "left_feature_group": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "left_features": ["intt"], + "feature_store_name": "test_feature_store_name", + "feature_store_id": 67, + "left_feature_group_start_time": "test_left_feature_group_start_time", + "left_feature_group_end_time": "test_left_feature_group_end_time", + "joins": [], + "filter": null + }, + "on": ["test_on"], + "left_on": ["test_left_on"], + "right_on": ["test_right_on"], + "join_type": "inner", + "prefix": "test_prefix" + } + ], + "filter": { + "condition": "test_condition", + "value": "test_value", + "feature": { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + } + } + }, + "featurestore_id": 5, + "id": 11, + "version": 1, + 
"description": "test_description", + "transformation_functions": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } + } + ], + "features": [ + { + "name": "intt", + "label": "t", + "featuregroup": { + "type": "featuregroupDTO", + "featurestoreId": 67, + "version": 1, + "name": "fg_test", + "id": 15, + "statisticsConfig": { + "enabled": true, + "histograms": false, + "correlations": false, + "exactUniqueness": false, + "columns": [] + }, + "onlineEnabled": false, + "deprecated": false + } + }, + { + "name": "stringt", + "featurestoreId": 67, + "featuregroup": { + "type": "featuregroupDTO", + "featurestoreId": 67, + "version": 1, + "name": "fg_test", + "id": 15, + "statisticsConfig": { + "enabled": true, + "histograms": false, + "correlations": false, + "exactUniqueness": false, + "columns": [] + }, + "onlineEnabled": false, + "deprecated": false + } + } + ] + } } } From f0e9540d6927a1e0381efbe8b4295c22a0db25dc Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 15:03:25 +0200 Subject: [PATCH 23/58] reformatting with ruff --- python/hsfs/engine/python.py | 20 +++++++++++++++++--- python/hsfs/engine/spark.py | 12 ++++++++++-- python/hsfs/training_dataset_feature.py | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 6d213f7778..03daa581df 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -30,7 +30,17 @@ from datetime import datetime, timezone from io import BytesIO from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Literal, + Optional, + Tuple, + Union, +) import avro import boto3 @@ -209,7 +219,6 @@ def _sql_offline( hive_config: Optional[Dict[str, Any]] = None, arrow_flight_config: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame]: - self._validate_dataframe_type(dataframe_type) if isinstance(sql_query, dict) and "query_string" in sql_query: result_df = util.run_with_loading_animation( @@ -513,7 +522,12 @@ def show( sql_query, feature_store, online_conn, "default", read_options or {} ).head(n) - def read_vector_db(self, feature_group: "hsfs.feature_group.FeatureGroup", n: int =None, dataframe_type: str="default") -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: + def read_vector_db( + self, + feature_group: "hsfs.feature_group.FeatureGroup", + n: int = None, + dataframe_type: str = "default", + ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: dataframe_type = dataframe_type.lower() self._validate_dataframe_type(dataframe_type) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index f1f6fcb69a..48bce2e351 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,13 +23,14 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import Any, List, Optional, TypeVar, Union, 
TYPE_CHECKING, Dict +from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union import avro import numpy as np import pandas as pd import tzlocal + if TYPE_CHECKING: from hsfs.constructor.query import Query from hsfs.feature_view import FeatureView @@ -158,7 +159,14 @@ def show(self, sql_query, feature_store, n, online_conn, read_options=None): sql_query, feature_store, online_conn, "default", read_options ).show(n) - def read_vector_db(self, feature_group: fg_mod.FeatureGroup, n: int =None, dataframe_type: str="default") -> Union[pd.DataFrame, np.ndarray, List[List[Any]], TypeVar("pyspark.sql.DataFrame")]: + def read_vector_db( + self, + feature_group: fg_mod.FeatureGroup, + n: int = None, + dataframe_type: str = "default", + ) -> Union[ + pd.DataFrame, np.ndarray, List[List[Any]], TypeVar("pyspark.sql.DataFrame") + ]: results = VectorDbClient.read_feature_group(feature_group, n) feature_names = [f.name for f in feature_group.features] dataframe_type = dataframe_type.lower() diff --git a/python/hsfs/training_dataset_feature.py b/python/hsfs/training_dataset_feature.py index c444e833c7..a06637abe2 100644 --- a/python/hsfs/training_dataset_feature.py +++ b/python/hsfs/training_dataset_feature.py @@ -20,6 +20,7 @@ from hsfs import feature_group as feature_group_mod from hsfs import util + class TrainingDatasetFeature: def __init__( self, From 60726423f7e18352008db7ef1467e2643954c088 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 15:43:17 +0200 Subject: [PATCH 24/58] fixing vector server --- .../core/transformation_function_engine.py | 43 +++++++++++++++++++ python/hsfs/core/vector_server.py | 10 ++--- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 0384e05ac9..773380a113 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -139,6 +139,49 @@ def compute_transformation_fn_statistics( feature_view_obj=feature_view_obj, ) + @staticmethod + def get_ready_to_use_transformation_fns( + feature_view: FeatureView, + training_dataset_version: Optional[int] = None, + ) -> List[TransformationFunction]: + # get attached transformation functions + transformation_functions = ( + feature_view._feature_view_engine.get_attached_transformation_fn() + ) + is_stat_required = any( + [tf.hopsworks_udf.statistics_required for tf in transformation_functions] + ) + if not is_stat_required: + td_tffn_stats = None + else: + # if there are any transformation functions that require statistics get related statistics and + # populate with relevant arguments + # there should be only one statistics object with before_transformation=true + if training_dataset_version is None: + raise ValueError( + "Training data version is required for transformation. Call `feature_view.init_serving(version)` " + "or `feature_view.init_batch_scoring(version)` to pass the training dataset version." + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." + ) + td_tffn_stats = feature_view._statistics_engine.get( + feature_view, + before_transformation=True, + training_dataset_version=training_dataset_version, + ) + + if is_stat_required and td_tffn_stats is None: + raise ValueError( + "No statistics available for initializing transformation functions." + + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." 
+            )
+
+        if is_stat_required:
+            for transformation_function in transformation_functions:
+                transformation_function.hopsworks_udf.transformation_statistics = (
+                    td_tffn_stats.feature_descriptive_statistics
+                )
+        return transformation_functions
+
     @staticmethod
     def compute_and_set_feature_statistics(
         training_dataset: training_dataset.TrainingDataset,
diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py
index 1ef1df0854..49892db1c4 100755
--- a/python/hsfs/core/vector_server.py
+++ b/python/hsfs/core/vector_server.py
@@ -181,14 +181,12 @@ def init_batch_scoring(
     def init_transformation(
         self,
-        entity: Union[feature_view.FeatureView, training_dataset.TrainingDataset],
+        entity: Union[feature_view.FeatureView],
     ):
         # attach transformation functions
-        self._transformation_functions = (
-            self.transformation_function_engine.get_ready_to_use_transformation_fns(
-                entity,
-                self._training_dataset_version,
-            )
+        self._transformation_functions = transformation_function_engine.TransformationFunctionEngine.get_ready_to_use_transformation_fns(
+            entity,
+            self._training_dataset_version,
         )

     def setup_sql_client(
From f46f0b71102b87caf5f1b78fa88cb97a16673af7 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Mon, 13 May 2024 16:29:15 +0200
Subject: [PATCH 25/58] fixing docs

---
 python/hsfs/core/feature_view_engine.py      | 20 ++---
 .../core/transformation_function_engine.py   | 77 ++++++++++---------
 python/hsfs/core/vector_server.py            | 21 +++--
 python/hsfs/engine/python.py                 | 22 ++----
 python/hsfs/engine/spark.py                  | 49 ++++++------
 python/hsfs/feature_store.py                 |  7 +-
 python/hsfs/feature_view.py                  |  5 +-
 7 files changed, 96 insertions(+), 105 deletions(-)

diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py
index 19ea348b97..070be9b821 100644
--- a/python/hsfs/core/feature_view_engine.py
+++ b/python/hsfs/core/feature_view_engine.py
@@ -17,7 +17,7 @@
 import datetime
 import warnings
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import List, Optional, Union

 from hsfs import (
     client,
@@ -25,6 +25,7 @@
     feature_group,
     feature_view,
     training_dataset_feature,
+    transformation_function,
     util,
 )
 from hsfs.client import exceptions
@@ -41,11 +42,6 @@
 from hsfs.training_dataset_split import TrainingDatasetSplit


-if TYPE_CHECKING:
-    from hsfs.feature_view import FeatureView
-    from hsfs.transformation_function import TransformationFunction
-
-
 class FeatureViewEngine:
     ENTITY_TYPE = "featureview"
     _TRAINING_DATA_API_PATH = "trainingdatasets"
@@ -68,7 +64,9 @@ def __init__(self, feature_store_id):
         )
         self._query_constructor_api = query_constructor_api.QueryConstructorApi()

-    def save(self, feature_view_obj: FeatureView) -> FeatureView:
+    def save(
+        self, feature_view_obj: feature_view.FeatureView
+    ) -> feature_view.FeatureView:
         """
         Save a feature view to the backend.
@@ -135,7 +133,9 @@ def save(self, feature_view_obj: FeatureView) -> FeatureView:
         )
         return updated_fv

-    def update(self, feature_view_obj: FeatureView) -> FeatureView:
+    def update(
+        self, feature_view_obj: feature_view.FeatureView
+    ) -> feature_view.FeatureView:
         """
         Update the feature view object saved in the backend
@@ -150,7 +150,7 @@ def update(self, feature_view_obj: FeatureView) -> FeatureView:

     def get(
         self, name: str, version: int = None
-    ) -> Union[FeatureView, List[FeatureView]]:
+    ) -> Union[feature_view.FeatureView, List[feature_view.FeatureView]]:
         """
         Get a feature view from the backend using name or using name and version.
@@ -267,7 +267,7 @@ def get_batch_query_string(

     def get_attached_transformation_fn(
         self, name: str, version: int
-    ) -> List[TransformationFunction]:
+    ) -> List[transformation_function.TransformationFunction]:
         """
         Get transformation functions attached to a feature view from the backend
diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py
index 773380a113..128c98e6cb 100644
--- a/python/hsfs/core/transformation_function_engine.py
+++ b/python/hsfs/core/transformation_function_engine.py
@@ -15,21 +15,14 @@
 #
 from __future__ import annotations

-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
+from typing import Dict, List, Optional, Set, TypeVar, Union

-from hsfs import training_dataset
+import pandas as pd
+import polars as pl
+from hsfs import feature_view, statistics, training_dataset, transformation_function
 from hsfs.core import transformation_function_api

-if TYPE_CHECKING:
-    import pandas as pd
-    import polars as pl
-    import pyspark.sql as ps
-    from hsfs.feature_view import FeatureView
-    from hsfs.statistics import Statistics
-    from hsfs.transformation_function import TransformationFunction
-
-
 class TransformationFunctionEngine:
     BUILTIN_FN_NAMES = [
         "min_max_scaler",
@@ -50,13 +43,13 @@ def __init__(self, feature_store_id: int):
         )

     def save(
-        self, transformation_fn_instance: TransformationFunction
-    ) -> TransformationFunction:
+        self, transformation_fn_instance: transformation_function.TransformationFunction
+    ) -> transformation_function.TransformationFunction:
         """
         Save a transformation function into the feature store.

         # Arguments
-            transformation_fn_instance `TransformationFunction`: The transformation function to be saved into the feature store.
+            transformation_fn_instance `transformation_function.TransformationFunction`: The transformation function to be saved into the feature store.
         """
         self._transformation_function_api.register_transformation_fn(
             transformation_fn_instance
         )

     def get_transformation_fn(
         self, name: str, version: Optional[int] = None
-    ) -> Union[TransformationFunction, List[TransformationFunction]]:
+    ) -> Union[
+        transformation_function.TransformationFunction,
+        List[transformation_function.TransformationFunction],
+    ]:
         """
         Retrieve a transformation function from the feature store.
@@ -75,7 +71,7 @@
             name `Optional[str]`: The name of the transformation function to be retrieved.
             version `Optional[int]`: The version of the transformation function to be retrieved.
         # Returns
-            `Union[TransformationFunction, List[TransformationFunction]]` : A transformation function if name and version is provided. A list of transformation functions if only name is provided.
+            `Union[transformation_function.TransformationFunction, List[transformation_function.TransformationFunction]]` : A transformation function if name and version is provided. A list of transformation functions if only name is provided.
         """
         transformation_fn_instances = (
             self._transformation_function_api.get_transformation_fn(
         )
         return transformation_fn_instances

-    def get_transformation_fns(self) -> List[TransformationFunction]:
+    def get_transformation_fns(
+        self,
+    ) -> List[transformation_function.TransformationFunction]:
         """
         Get all the transformation functions in the feature store

         # Returns
-            `List[TransformationFunction]` : A list of transformation functions.
+            `List[transformation_function.TransformationFunction]` : A list of transformation functions.
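
        # Example
            A minimal usage sketch, added here for illustration only (assumes
            an engine created for a connected feature store; the id is a
            placeholder):

            ```python
            engine = TransformationFunctionEngine(feature_store_id=67)
            for fn in engine.get_transformation_fns():
                print(fn.hopsworks_udf.function_name)
            ```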
""" transformation_fn_instances = ( self._transformation_function_api.get_transformation_fn( @@ -102,12 +100,15 @@ def get_transformation_fns(self) -> List[TransformationFunction]: transformation_fns.append(transformation_fn_instance) return transformation_fns - def delete(self, transformation_function_instance: TransformationFunction) -> None: + def delete( + self, + transformation_function_instance: transformation_function.TransformationFunction, + ) -> None: """ Delete a transformation function from the feature store. # Arguments - transformation_function_instance `TransformationFunction`: The transformation function to be removed from the feature store. + transformation_function_instance `transformation_function.TransformationFunction`: The transformation function to be removed from the feature store. """ self._transformation_function_api.delete(transformation_function_instance) @@ -116,9 +117,11 @@ def compute_transformation_fn_statistics( training_dataset_obj: training_dataset.TrainingDataset, statistics_features: List[str], label_encoder_features: List[str], - feature_dataframe: Union[pd.DataFrame, pl.DataFrame, ps.DataFrame], - feature_view_obj: FeatureView, - ) -> Statistics: + feature_dataframe: Union[ + pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame") + ], + feature_view_obj: feature_view.FeatureView, + ) -> statistics.Statistics: """ Compute the statistics required for a training dataset object. @@ -141,9 +144,9 @@ def compute_transformation_fn_statistics( @staticmethod def get_ready_to_use_transformation_fns( - feature_view: FeatureView, + feature_view: feature_view.FeatureView, training_dataset_version: Optional[int] = None, - ) -> List[TransformationFunction]: + ) -> List[transformation_function.TransformationFunction]: # get attached transformation functions transformation_functions = ( feature_view._feature_view_engine.get_attached_transformation_fn() @@ -185,10 +188,12 @@ def get_ready_to_use_transformation_fns( @staticmethod def compute_and_set_feature_statistics( training_dataset: training_dataset.TrainingDataset, - feature_view_obj: FeatureView, + feature_view_obj: feature_view.FeatureView, dataset: Union[ - Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]], - Union[pd.DataFrame, pl.DataFrame, ps.DataFrame], + Dict[ + str, Union[pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame")] + ], + Union[pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame")], ], ) -> None: """ @@ -204,10 +209,8 @@ def compute_and_set_feature_statistics( statistics_features: Set[str] = set() # Finding the features for which statistics is required - for transformation_function in feature_view_obj.transformation_functions: - statistics_features.update( - transformation_function.hopsworks_udf.statistics_features - ) + for tf in feature_view_obj.transformation_functions: + statistics_features.update(tf.hopsworks_udf.statistics_features) if statistics_features: # compute statistics on training data if training_dataset.splits: @@ -233,15 +236,15 @@ def compute_and_set_feature_statistics( ) # Set statistics computed in the hopsworks UDF - for transformation_function in feature_view_obj.transformation_functions: - transformation_function.hopsworks_udf.transformation_statistics = ( + for tf in feature_view_obj.transformation_functions: + tf.hopsworks_udf.transformation_statistics = ( stats.feature_descriptive_statistics ) @staticmethod def get_and_set_feature_statistics( training_dataset: training_dataset.TrainingDataset, - feature_view_obj: FeatureView, + 
feature_view_obj: feature_view.FeatureView, training_dataset_version: int = None, ) -> None: """ @@ -277,7 +280,7 @@ def get_and_set_feature_statistics( "No statistics available for initializing transformation functions." ) - for transformation_function in feature_view_obj.transformation_functions: - transformation_function.hopsworks_udf.transformation_statistics = ( + for tf in feature_view_obj.transformation_functions: + tf.hopsworks_udf.transformation_statistics = ( td_tffn_stats.feature_descriptive_statistics ) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 49892db1c4..94468f1dde 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -43,13 +43,13 @@ online_store_rest_client_engine, online_store_sql_engine, transformation_function_engine as tf_engine_mod, - transformation_functions + transformation_function ) + HAS_FASTAVRO = False try: from fastavro import schemaless_reader - HAS_FASTAVRO = True except ImportError: from avro.io import BinaryDecoder @@ -106,7 +106,7 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: List[transformation_functions.TransformationFunction] = [] + self._transformation_functions: List[transformation_function.TransformationFunction] = [] self._sql_client = None @@ -184,7 +184,7 @@ def init_transformation( entity: Union[feature_view.FeatureView], ): # attach transformation functions - self._transformation_functions = transformation_function_engine.TransformationFunctionEngine.get_ready_to_use_transformation_fns( + self._transformation_functions = tf_engine_mod.TransformationFunctionEngine.get_ready_to_use_transformation_fns( entity, self._training_dataset_version, ) @@ -564,7 +564,6 @@ def get_inference_helpers( batch_results, batch=True, inference_helper=True, return_type=return_type ) - def which_client_and_ensure_initialised( self, force_rest_client: bool, force_sql_client: bool ) -> str: @@ -628,14 +627,12 @@ def _set_default_client( def apply_transformation(self, row_dict: dict): _logger.debug("Applying transformation functions to : %s", matching_keys) - for transformation_function in self.transformation_functions: + for tf in self.transformation_functions: features = [ pd.Series(row_dict[feature]) - for feature in transformation_function.hopsworks_udf.transformation_features + for feature in tf.hopsworks_udf.transformation_features ] - transformed_result = transformation_function.hopsworks_udf.get_udf()( - *features - ) + transformed_result = tf.hopsworks_udf.get_udf()(*features) if isinstance(transformed_result, pd.Series): row_dict[transformed_result.name] = transformed_result.values[0] else: @@ -678,6 +675,7 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]: for f in self._features if f.is_complex() } + if len(complex_feature_schemas) == 0: return {} else: @@ -869,7 +867,6 @@ def identify_missing_features_pre_fetch( passed_feature_names = passed_feature_names.union( vector_db_features.keys() ) - neither_fetched_nor_passed = fetched_features.difference( passed_feature_names ) @@ -912,7 +909,7 @@ def build_per_serving_key_features( ] ) return per_serving_key_features - + @property def sql_client( self, diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 03daa581df..e6d55a8238 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -31,7 +31,6 @@ from io import BytesIO from pathlib import Path from typing 
import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -58,6 +57,7 @@ feature, feature_store, feature_view, + transformation_function, util, ) from hsfs import storage_connector as sc @@ -90,10 +90,6 @@ from tqdm.auto import tqdm -if TYPE_CHECKING: - from hsfs.transformation_function import TransformationFunction - - # Disable pyhive INFO logging logging.getLogger("pyhive").setLevel(logging.WARNING) @@ -1296,14 +1292,14 @@ def add_file(self, file: Optional[str]) -> Optional[str]: def _apply_transformation_function( self, - transformation_functions: List[TransformationFunction], + transformation_functions: List[transformation_function.TransformationFunction], dataset: Union[pd.DataFrame, pl.DataFrame], ) -> Union[pd.DataFrame, pl.DataFrame]: """ Apply transformation function to the dataframe. # Arguments - transformation_functions `List[TransformationFunction]` : List of transformation functions. + transformation_functions `List[transformation_function.TransformationFunction]` : List of transformation functions. dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. # Returns `DataFrame`: A pandas dataframe with the transformed data. @@ -1323,8 +1319,8 @@ def _apply_transformation_function( else: dataset = dataset.to_pandas(use_pyarrow_extension_array=False) - for transformation_function in transformation_functions: - hopsworks_udf = transformation_function.hopsworks_udf + for tf in transformation_functions: + hopsworks_udf = tf.hopsworks_udf missing_features = set(hopsworks_udf.transformation_features) - set( dataset.columns ) @@ -1333,17 +1329,15 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." 
) - transformed_features.update( - transformation_function.hopsworks_udf.transformation_features - ) + transformed_features.update(tf.hopsworks_udf.transformation_features) dataset = pd.concat( [ dataset, - transformation_function.hopsworks_udf.get_udf()( + tf.hopsworks_udf.get_udf()( *( [ dataset[feature] - for feature in transformation_function.hopsworks_udf.transformation_features + for feature in tf.hopsworks_udf.transformation_features ] ) ), diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 48bce2e351..c462efa641 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,19 +23,13 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union +from typing import Any, Dict, List, Optional, TypeVar, Union import avro import numpy as np import pandas as pd import tzlocal - - -if TYPE_CHECKING: - from hsfs.constructor.query import Query - from hsfs.feature_view import FeatureView - from hsfs.training_dataset import TrainingDataset - from hsfs.transformation_function import TransformationFunction +from hsfs.constructor import query # in case importing in %%local from hsfs.core.vector_db_client import VectorDbClient @@ -89,11 +83,18 @@ def iteritems(self): DataContextConfig, InMemoryStoreBackendDefaults, ) -from hsfs import client, feature, training_dataset_feature, util +from hsfs import ( + client, + feature, + feature_view, + training_dataset, + training_dataset_feature, + transformation_function, + util, +) from hsfs import feature_group as fg_mod from hsfs.client import hopsworks from hsfs.client.exceptions import FeatureStoreException -from hsfs.constructor import query from hsfs.core import ( dataset_api, delta_engine, @@ -556,9 +557,9 @@ def _online_fg_to_avro(self, feature_group, dataframe): def get_training_data( self, - training_dataset: TrainingDataset, - feature_view_obj: FeatureView, - query_obj: Query, + training_dataset: training_dataset.TrainingDataset, + feature_view_obj: feature_view.FeatureView, + query_obj: query.Query, read_options: Dict[str, Any], dataframe_type: str, training_dataset_version: int = None, @@ -607,12 +608,12 @@ def drop_columns(self, df, drop_cols): def write_training_dataset( self, - training_dataset: TrainingDataset, - query_obj: Query, + training_dataset: training_dataset.TrainingDataset, + query_obj: query.Query, user_write_options: Dict[str, Any], save_mode: str, read_options: Dict[str, Any] = None, - feature_view_obj: FeatureView = None, + feature_view_obj: feature_view.FeatureView = None, to_df: bool = False, training_dataset_version: Optional[int] = None, ): @@ -844,7 +845,9 @@ def _write_training_dataset_splits( write_options, save_mode, to_df=False, - transformation_functions: List[TransformationFunction] = None, + transformation_functions: List[ + transformation_function.TransformationFunction + ] = None, ): for split_name, feature_dataframe in feature_dataframes.items(): split_path = training_dataset.location + "/" + str(split_name) @@ -1226,7 +1229,9 @@ def add_cols_to_delta_table(self, feature_group, new_features): ).save(feature_group.location) def _apply_transformation_function( - self, transformation_functions: List[TransformationFunction], dataset: DataFrame + self, + transformation_functions: List[transformation_function.TransformationFunction], + dataset: DataFrame, ): """ Apply transformation function to the dataframe. 
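
        A hedged sketch of the expected call pattern (illustrative names;
        assumes a Spark DataFrame `df` and the transformation functions
        attached to a feature view `fv`):

        ```python
        transformed_df = spark_engine._apply_transformation_function(
            transformation_functions=fv.transformation_functions,
            dataset=df,
        )
        ```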
@@ -1244,8 +1249,8 @@ def _apply_transformation_function( transformation_features = [] output_col_names = [] explode_name = [] - for transformation_function in transformation_functions: - hopsworks_udf = transformation_function.hopsworks_udf + for tf in transformation_functions: + hopsworks_udf = tf.hopsworks_udf missing_features = set(hopsworks_udf.transformation_features) - set( dataset.columns ) @@ -1255,9 +1260,7 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - transformed_features.update( - transformation_function.hopsworks_udf.transformation_features - ) + transformed_features.update(tf.hopsworks_udf.transformation_features) pandas_udf = hopsworks_udf.get_udf() output_col_name = hopsworks_udf.output_column_names[0] diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 41d1a754ff..848252cb64 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -18,7 +18,7 @@ import datetime import warnings -from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union +from typing import Any, Dict, List, Optional, TypeVar, Union import great_expectations as ge import humps @@ -48,14 +48,11 @@ ) from hsfs.decorators import typechecked from hsfs.embedding import EmbeddingIndex +from hsfs.hopsworks_udf import HopsworksUdf from hsfs.statistics_config import StatisticsConfig from hsfs.transformation_function import TransformationFunction -if TYPE_CHECKING: - from hsfs.hopsworks_udf import HopsworksUdf - - @typechecked class FeatureStore: DEFAULT_VERSION = 1 diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 5b90fabfc2..ad53317d75 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -54,16 +54,13 @@ from hsfs.core.vector_db_client import VectorDbClient from hsfs.decorators import typechecked from hsfs.feature import Feature +from hsfs.hopsworks_udf import HopsworksUdf from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig from hsfs.training_dataset_split import TrainingDatasetSplit from hsfs.transformation_function import TransformationFunction -if TYPE_CHECKING: - from hsfs.hopsworks_udf import HopsworksUdf - - _logger = logging.getLogger(__name__) TrainingDatasetDataFrameTypes = Union[ From 07348d52bfaa664439714b2cd9a72b6a90208462 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 14 May 2024 08:26:59 +0200 Subject: [PATCH 26/58] fixing vector server --- .../core/transformation_function_engine.py | 11 ++++++++++- python/hsfs/core/vector_server.py | 19 +++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 128c98e6cb..304b0fcabb 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -149,8 +149,17 @@ def get_ready_to_use_transformation_fns( ) -> List[transformation_function.TransformationFunction]: # get attached transformation functions transformation_functions = ( - feature_view._feature_view_engine.get_attached_transformation_fn() + feature_view._feature_view_engine.get_attached_transformation_fn( + feature_view.name, feature_view.version + ) ) + + transformation_functions = ( + [transformation_functions] + if not isinstance(transformation_functions, list) + else 
transformation_functions + ) + is_stat_required = any( [tf.hopsworks_udf.statistics_required for tf in transformation_functions] ) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 94468f1dde..ed168d6295 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -429,7 +429,7 @@ def assemble_feature_vector( _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) - return [result_dict.get(fname, None) for fname in self.feature_vector_col_name] + return [result_dict.get(fname, None) for fname in self.transformed_feature_vector_col_name] def handle_feature_vector_return_type( self, @@ -909,7 +909,7 @@ def build_per_serving_key_features( ] ) return per_serving_key_features - + @property def sql_client( self, @@ -1062,9 +1062,20 @@ def default_client(self, default_client: Literal["rest", "sql"]): def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: - self._transformed_feature_vector_col_name = self._feature_vector_col_name + transformation_features = [] + output_column_names = [] for transformation_function in self._transformation_functions: - self._transformed_feature_vector_col_name += ( + transformation_features += ( transformation_function.hopsworks_udf.transformation_features ) + output_column_names += ( + transformation_function.hopsworks_udf.output_column_names + ) + + self._transformed_feature_vector_col_name = [ + feature + for feature in self._feature_vector_col_name + if feature not in transformation_features + ] + self._transformed_feature_vector_col_name.extend(output_column_names) return self._transformed_feature_vector_col_name \ No newline at end of file From 41a02acd075cec78e0ba2f3d10735b46b2e9d544 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 14 May 2024 09:31:58 +0200 Subject: [PATCH 27/58] fixing building in transformations --- .../hsfs/core/transformation_function_engine.py | 10 ++++++++-- python/hsfs/hopsworks_udf.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 304b0fcabb..ddbaebe8e2 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -216,10 +216,16 @@ def compute_and_set_feature_statistics( dataset `Union[Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]], Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]]`: A dataframe that conqtains the training data or a dictionary that contains both the training and test data. 
""" statistics_features: Set[str] = set() + label_encoder_features: Set[str] = set() # Finding the features for which statistics is required for tf in feature_view_obj.transformation_functions: statistics_features.update(tf.hopsworks_udf.statistics_features) + if ( + tf.hopsworks_udf.function_name == "label_encoder" + or tf.hopsworks_udf.function_name == "one_hot_encoder" + ): + label_encoder_features.update(tf.hopsworks_udf.statistics_features) if statistics_features: # compute statistics on training data if training_dataset.splits: @@ -228,7 +234,7 @@ def compute_and_set_feature_statistics( TransformationFunctionEngine.compute_transformation_fn_statistics( training_dataset, list(statistics_features), - [], + list(label_encoder_features), dataset.get(training_dataset.train_split), feature_view_obj, ) @@ -238,7 +244,7 @@ def compute_and_set_feature_statistics( TransformationFunctionEngine.compute_transformation_fn_statistics( training_dataset, list(statistics_features), - [], + list(label_encoder_features), dataset, feature_view_obj, ) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 9b3b332812..049818d234 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -480,6 +480,19 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": udf.output_column_names = udf._get_output_column_names() return udf + def update_return_type_one_hot(self): + self._output_types = [ + self._output_types[0] + for _ in range( + len( + self.transformation_statistics[ + "statistics_feature" + ].extended_statistics["unique_values"] + ) + ) + ] + self.output_column_names = self._get_output_column_names() + def get_udf(self) -> Callable: """ Function that checks the current engine type and returns the appropriate UDF. @@ -490,6 +503,10 @@ def get_udf(self) -> Callable: # Returns `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
""" + # Update the number of outputs for one hot encoder to match the number of unique values for the feature + if self.function_name == "one_hot_encoder": + self.update_return_type_one_hot() + if engine.get_type() in ["hive", "python", "training"]: return self.hopsworksUdf_wrapper() else: From 221560613f6afb2fb405b6d827fa5c2564dcfe11 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 14 May 2024 16:33:34 +0200 Subject: [PATCH 28/58] correcting get feature vector --- python/hsfs/core/vector_server.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index ed168d6295..8e25e18632 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -637,9 +637,7 @@ def apply_transformation(self, row_dict: dict): row_dict[transformed_result.name] = transformed_result.values[0] else: for col in transformed_result: - row_dict[transformed_result.name] = transformed_result[col].values[ - 0 - ] + row_dict[col] = transformed_result[col].values[0] return row_dict def apply_return_value_handlers( From e1d7abe504f53f8c8b54b6ba3ed3b40558163deb Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 16 May 2024 11:13:34 +0200 Subject: [PATCH 29/58] adding missed changes for build in transformations --- python/hsfs/builtin_transformations.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index d17ae6f1fa..35a26d137e 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -42,7 +42,7 @@ def robust_scaler( ) -# @hopsworks_udf(int) +@hopsworks_udf(int) def label_encoder( feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics ) -> pd.Series: @@ -53,15 +53,16 @@ def label_encoder( return pd.Series([value_to_index[data] for data in feature]) +@hopsworks_udf(bool) def one_hot_encoder( feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics ) -> pd.Series: unique_data = [ value for value in statistics_feature.extended_statistics["unique_values"] ] - print(statistics_feature.extended_statistics["unique_values"]) one_hot = pd.get_dummies(feature, dtype="bool") for data in unique_data: if data not in one_hot: one_hot[data] = False - return one_hot + # Sorting by columns so as to maintain consistency in column order. + return one_hot.reindex(sorted(one_hot.columns), axis=1) From 2d0bca3eba7ef9e344a962a030a17065c907269a Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 17 May 2024 12:56:33 +0200 Subject: [PATCH 30/58] shallow copying scope dictonary to not overwrite statistics variable for different udf's having same statistics parameter name --- python/hsfs/hopsworks_udf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 049818d234..0091e50481 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -433,8 +433,11 @@ def renaming_wrapper(*args): df = convert_timezone(df) return df""" ) + # injecting variables into scope used to execute wrapper function. - scope = __import__("__main__").__dict__ + + # Shallow copy of scope performed because updating statistics argument of scope must not affect other instances. 
+ scope = __import__("__main__").__dict__.copy() if self.transformation_statistics is not None: scope.update(self.transformation_statistics) scope.update({"_output_col_names": self.output_column_names}) # executing code exec(code, scope) From 37f96fa9f7b3947f9909acd214852ca2245c5008 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 20 May 2024 15:38:11 +0200 Subject: [PATCH 31/58] adding deep copy to create multiple transformation functions with different features --- python/hsfs/transformation_function.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index b6ef060cb9..ce33a2b8d0 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -14,6 +14,7 @@ # from __future__ import annotations +import copy import json from typing import Any, Dict, List, Optional, Union @@ -129,8 +130,10 @@ def __call__(self, *features: List[str]) -> TransformationFunction: # Raises `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. """ - self._hopsworks_udf = self._hopsworks_udf(*features) - return self + # Deep copy so that the same transformation function can be used to create multiple new transformation functions with different features. + transformation = copy.deepcopy(self) + transformation._hopsworks_udf = transformation._hopsworks_udf(*features) + return transformation @classmethod def from_response_json( From 37a8b2388b765f8a504ed11609d7612ddb24743e Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 21 May 2024 02:02:41 +0200 Subject: [PATCH 32/58] sorting transformation function to maintain consistent order --- python/hsfs/feature_view.py | 22 ++++++++++++++++++++++ python/hsfs/transformation_function.py | 5 +++++ 2 files changed, 27 insertions(+) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index ad53317d75..a8e51c3b69 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -135,6 +135,11 @@ def __init__( else [] ) + if self._transformation_functions: + self._transformation_functions = FeatureView._sort_transformation_functions( + self._transformation_functions + ) + self._features = [] self._feature_view_engine: feature_view_engine.FeatureViewEngine = ( feature_view_engine.FeatureViewEngine(featurestore_id) ) @@ -378,6 +383,23 @@ def init_serving( self.query, serving_keys=self._serving_keys ) + @staticmethod + def _sort_transformation_functions( + transformation_functions: List[TransformationFunction], + ) -> List[TransformationFunction]: + """ + Function that sorts transformation functions in the order of the output column names. + + The list of transformation functions is sorted based on the output column names to maintain consistent ordering. + + # Arguments + transformation_functions: `List[TransformationFunction]`.
List of transformation functions to be sorted + + # Returns + `List[TransformationFunction]`: Sorted list of transformation functions + """ + return sorted(transformation_functions, key=lambda x: x.output_column_names[0]) + def init_batch_scoring( self, training_dataset_version: Optional[int] = None, diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index ce33a2b8d0..3267b4d14a 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -228,3 +228,8 @@ def version(self, version: int) -> None: def hopsworks_udf(self) -> HopsworksUdf: """Meta data class for the user defined transformation function.""" return self._hopsworks_udf + + @property + def output_column_names(self) -> List[str]: + """Output column names of transformation functions""" + return self._hopsworks_udf._output_column_names From eb77d70a701d1f243d7176ca28fd0fa723d84d58 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 21 May 2024 11:04:44 +0200 Subject: [PATCH 33/58] sorting transformation functions in transformation function engine to maintain same order --- python/hsfs/core/transformation_function_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index ddbaebe8e2..ec5de0810b 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -192,7 +192,7 @@ def get_ready_to_use_transformation_fns( transformation_function.hopsworks_udf.transformation_statistics = ( td_tffn_stats.feature_descriptive_statistics ) - return transformation_functions + return feature_view._sort_transformation_functions(transformation_functions) @staticmethod def compute_and_set_feature_statistics( From 68c95aa78c5ffc3ec78e8ad90381290623979a30 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 21 May 2024 15:03:01 +0200 Subject: [PATCH 34/58] using feature view transformation functions --- python/hsfs/engine/python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index e6d55a8238..e88a530c90 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1022,7 +1022,7 @@ def _prepare_transform_split_df( # and the apply them for split_name in result_dfs: result_dfs[split_name] = self._apply_transformation_function( - training_dataset_obj.transformation_functions, + feature_view_obj.transformation_functions, result_dfs.get(split_name), ) From 88bff75417427e48491a14228dae92838eddc2cd Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 23 May 2024 06:23:17 +0200 Subject: [PATCH 35/58] addressing review comments --- python/hsfs/__init__.py | 9 ------- python/hsfs/builtin_transformations.py | 5 +++- python/hsfs/core/vector_server.py | 4 ++- python/hsfs/engine/python.py | 34 +++++++++++++++++++------- python/hsfs/feature_store.py | 2 +- python/hsfs/hopsworks_udf.py | 17 +++++++------ python/tests/engine/test_python.py | 9 ++++--- 7 files changed, 49 insertions(+), 31 deletions(-) diff --git a/python/hsfs/__init__.py b/python/hsfs/__init__.py index 82d368d243..31efe17c56 100644 --- a/python/hsfs/__init__.py +++ b/python/hsfs/__init__.py @@ -19,17 +19,8 @@ import warnings import nest_asyncio -from packaging.version import Version try: - import pandas as pd - - if Version(pd.__version__) > Version("2.0"): - os.environ["USE_PYARROW_EXTENSION"] = "1" -except ImportError: - pass # Empty except
block because environment variable "USE_PYARROW_EXTENSION" need not be set if pyarrow cannot be imported or if pandas version is less than 2.0 - # Setting polars skip cpu flag to suppress CPU false positive warning messages printed while importing hsfs os.environ["POLARS_SKIP_CPU_CHECK"] = "1" diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 35a26d137e..d17126bd44 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -14,6 +14,7 @@ # limitations under the License. # +import numpy as np import pandas as pd from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.hopsworks_udf import hopsworks_udf @@ -50,7 +51,9 @@ def label_encoder( value for value in statistics_feature.extended_statistics["unique_values"] ] value_to_index = {value: index for index, value in enumerate(unique_data)} - return pd.Series([value_to_index[data] for data in feature]) + return pd.Series( + [value_to_index[data] if not pd.isna(data) else np.nan for data in feature] + ) @hopsworks_udf(bool) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 8e25e18632..3a90387390 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -632,7 +632,9 @@ def apply_transformation(self, row_dict: dict): pd.Series(row_dict[feature]) for feature in tf.hopsworks_udf.transformation_features ] - transformed_result = tf.hopsworks_udf.get_udf()(*features) + transformed_result = tf.hopsworks_udf.get_udf(force_python_udf=True)( + *features + ) # Get only python compatible UDF irrespective of engine if isinstance(transformed_result, pd.Series): row_dict[transformed_result.name] = transformed_result.values[0] else: diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index e88a530c90..e113015dc9 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1011,14 +1011,15 @@ def _prepare_transform_split_df( training_dataset_obj, ) - if training_dataset_version is None: - transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, result_dfs - ) - else: - transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, training_dataset_version - ) + # TODO : Currently statistics are always computed since the in-memory training dataset retrieved is not consistent + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( + training_dataset_obj, feature_view_obj, result_dfs + ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset_obj, feature_view_obj, training_dataset_version + # ) # and the apply them for split_name in result_dfs: result_dfs[split_name] = self._apply_transformation_function( @@ -1290,6 +1291,21 @@ def add_file(self, file: Optional[str]) -> Optional[str]: f.write(bytesio_object.getbuffer()) return local_file + def _check_pyarrow_extension(self): + """ + Function to check whether the pyarrow extension should be used when copying a polars dataframe to pandas + """ + try: + import pandas as pd + from packaging.version import Version + + if Version(pd.__version__) > Version("2.0"): + return True + else: + return False + except Exception: + return False # Return False if pyarrow or pandas cannot be imported + def
_apply_transformation_function( self, transformation_functions: List[transformation_function.TransformationFunction], @@ -1312,7 +1328,7 @@ def _apply_transformation_function( dataset, pl.dataframe.frame.DataFrame ): # Converting polars dataframe to pandas because currently we support only pandas UDF's as transformation functions. - if os.getenv("USE_PYARROW_EXTENSION", False): + if self._check_pyarrow_extension(): dataset = dataset.to_pandas( use_pyarrow_extension_array=True ) # Zero copy if pyarrow extension can be used. diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 848252cb64..5400d5d08d 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -1278,7 +1278,7 @@ def create_training_dataset( @usage.method_logger def create_transformation_function( self, - transformation_function: callable, + transformation_function: HopsworksUdf, version: Optional[int] = None, ) -> "TransformationFunction": """Create a transformation function metadata object. diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 0091e50481..328ac3c091 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -150,10 +150,10 @@ def __init__( ) ) - self._output_column_names: List[str] = self._get_output_column_names() - self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = None + self._output_column_names: List[str] = self._get_output_column_names() + @staticmethod def _validate_and_convert_output_types( output_types: Union[List[type], List[str]], @@ -496,21 +496,21 @@ def update_return_type_one_hot(self): ] self.output_column_names = self._get_output_column_names() - def get_udf(self) -> Callable: + def get_udf(self, force_python_udf: bool = False) -> Callable: """ Function that checks the current engine type and returns the appropriate UDF. In the spark engine the UDF is returned as a pandas UDF. While in the python engine the UDF is returned as python function. + # Arguments + force_python_udf: `bool`. Force return a python compatible udf irrespective of engine. + # Returns `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
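+ For example, the online feature vector server calls get_udf(force_python_udf=True) so that transformations are applied as plain python functions even when the spark engine is active.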
""" - # Update the number of outputs for one hot encoder to match the number of unique values for the feature - if self.function_name == "one_hot_encoder": - self.update_return_type_one_hot() - if engine.get_type() in ["hive", "python", "training"]: + if engine.get_type() in ["hive", "python", "training"] or force_python_udf: return self.hopsworksUdf_wrapper() else: from pyspark.sql.functions import pandas_udf @@ -581,6 +581,9 @@ def from_response_json( @property def output_types(self) -> List[str]: """Get the output types of the UDF""" + # Update the number of outputs for one hot encoder to match the number of unique values for the feature + if self.function_name == "one_hot_encoder" and self.transformation_statistics: + self.update_return_type_one_hot() return self._output_types @property diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 55267cc7ce..4b883f8ed2 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -2477,6 +2477,7 @@ def test_prepare_transform_split_df_random_split(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2504,7 +2505,7 @@ def test_prepare_transform_split_df_random_split(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) @@ -2525,6 +2526,7 @@ def test_prepare_transform_split_df_time_split_td_features(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2570,7 +2572,7 @@ def test_prepare_transform_split_df_time_split_td_features(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) @@ -2591,6 +2593,7 @@ def test_prepare_transform_split_df_time_split_query_features(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2635,7 +2638,7 @@ def test_prepare_transform_split_df_time_split_query_features(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) From 5ea3e43da150d71c1b764def668059f0409759a3 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 09:36:47 +0200 Subject: [PATCH 36/58] using PYARROW_EXTENSION_ENABLE during import rather than as a function --- python/hsfs/engine/python.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index e113015dc9..cc50428632 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -102,6 +102,18 @@ except ImportError: pass +PYARROW_EXTENSION_ENABLE = False +try: + import pandas as pd + from packaging.version import Version + + if Version(pd.__version__) > Version("2.0"): + PYARROW_EXTENSION_ENABLE = True + else: + PYARROW_EXTENSION_ENABLE = False +except Exception: + 
PYARROW_EXTENSION_ENABLE = False # Set PYARROW_EXTENSION_ENABLE to False if pyarrow or pandas cannot be imported + # Decimal types are currently not supported _INT_TYPES = [pa.uint8(), pa.uint16(), pa.int8(), pa.int16(), pa.int32()] _BIG_INT_TYPES = [pa.uint32(), pa.int64()] @@ -1291,21 +1303,6 @@ def add_file(self, file: Optional[str]) -> Optional[str]: f.write(bytesio_object.getbuffer()) return local_file - def _check_pyarrow_extension(self): - """ - Function to check whether the pyarrow extension should be used when copying a polars dataframe to pandas - """ - try: - import pandas as pd - from packaging.version import Version - - if Version(pd.__version__) > Version("2.0"): - return True - else: - return False - except Exception: - return False # Return False if pyarrow or pandas cannot be imported - def _apply_transformation_function( self, transformation_functions: List[transformation_function.TransformationFunction], @@ -1328,7 +1325,7 @@ def _apply_transformation_function( dataset, pl.dataframe.frame.DataFrame ): # Converting polars dataframe to pandas because currently we support only pandas UDF's as transformation functions. - if self._check_pyarrow_extension(): + if PYARROW_EXTENSION_ENABLE: dataset = dataset.to_pandas( use_pyarrow_extension_array=True ) # Zero copy if pyarrow extension can be used. From 58678bc409c32ef6948edf897a383ea341301639 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 13:15:00 +0200 Subject: [PATCH 37/58] skipping transformation function tests on windows; spark udf fails due to dependency problem with Great Expectations --- .../engine/test_python_spark_transformation_functions.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index f5763ea548..4929312bec 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -16,6 +16,7 @@ from __future__ import annotations import datetime +import os import statistics import pandas as pd @@ -44,6 +45,11 @@ ) +# TODO : Remove skipping UT in windows after Great Expectations has been upgraded to 1.0 or after it has been made optional +@pytest.mark.skipif( + os.name == "nt", + reason="Skip tests in windows since they fail due to a dependency problem with Great Expectations 0.18.2; fixed on upgrading to 1.0", +) class TestPythonSparkTransformationFunctions: def _create_training_dataset(self): f = training_dataset_feature.TrainingDatasetFeature( From be5036b85b7c20a6ccea5076b4f0aa5a8604d066 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 16:45:24 +0200 Subject: [PATCH 38/58] changing transformed_feature_vector_col_name to transformed_features to obtain feature names after transformations --- python/hsfs/core/vector_server.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 3a90387390..118ecca5e7 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -31,25 +31,26 @@ client, feature_view, training_dataset, + transformation_function, ) from hsfs import ( serving_key as sk_mod, ) -from hsfs import ( - training_dataset_feature as tdf_mod, -) +from hsfs import training_dataset_feature as tdf_mod from hsfs.client import exceptions, online_store_rest_client from hsfs.core import ( online_store_rest_client_engine, online_store_sql_engine,
+) +from hsfs.core import ( transformation_function_engine as tf_engine_mod, - transformation_function ) HAS_FASTAVRO = False try: from fastavro import schemaless_reader + HAS_FASTAVRO = True except ImportError: from avro.io import BinaryDecoder @@ -106,8 +107,9 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: List[transformation_function.TransformationFunction] = [] - + self._transformation_functions: List[ + transformation_function.TransformationFunction + ] = [] self._sql_client = None self._rest_client_engine = None @@ -429,7 +431,7 @@ def assemble_feature_vector( _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) - return [result_dict.get(fname, None) for fname in self.transformed_feature_vector_col_name] + return [result_dict.get(fname, None) for fname in self.transformed_features] def handle_feature_vector_return_type( self, @@ -563,7 +565,7 @@ def get_inference_helpers( return self.handle_feature_vector_return_type( batch_results, batch=True, inference_helper=True, return_type=return_type ) - + def which_client_and_ensure_initialised( self, force_rest_client: bool, force_sql_client: bool ) -> str: @@ -626,7 +628,7 @@ def _set_default_client( self._init_sql_client = True def apply_transformation(self, row_dict: dict): - _logger.debug("Applying transformation functions to : %s", matching_keys) + _logger.debug("Applying transformation functions.") for tf in self.transformation_functions: features = [ pd.Series(row_dict[feature]) @@ -1060,7 +1062,7 @@ def default_client(self, default_client: Literal["rest", "sql"]): _logger.debug(f"Default Online Store Client is set to {default_client}.") self._default_client = default_client - def transformed_feature_vector_col_name(self): + def transformed_features(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] output_column_names = [] From 3a01eadb0e4bad5257bc56c53f4ac1c7e467b0a9 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 17:21:14 +0200 Subject: [PATCH 39/58] adding property transformed_features in feature view to obtain feature names after transfromations --- python/hsfs/core/vector_server.py | 7 +++++-- python/hsfs/feature_view.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 118ecca5e7..5a344db890 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -431,7 +431,10 @@ def assemble_feature_vector( _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) - return [result_dict.get(fname, None) for fname in self.transformed_features] + return [ + result_dict.get(fname, None) + for fname in self.transformed_feature_vector_col_name + ] def handle_feature_vector_return_type( self, @@ -1062,7 +1065,7 @@ def default_client(self, default_client: Literal["rest", "sql"]): _logger.debug(f"Default Online Store Client is set to {default_client}.") self._default_client = default_client - def transformed_features(self): + def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] output_column_names = [] diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index a8e51c3b69..9ca317a473 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -3709,3 +3709,18 @@ def serving_keys(self) -> 
List[skm.ServingKey]: @serving_keys.setter def serving_keys(self, serving_keys: List[skm.ServingKey]) -> None: self._serving_keys = serving_keys + + @property + def transformed_features(self) -> List[str]: + """Names of the features of a feature view after transformation functions have been applied""" + transformation_features = set() + transformed_column_names = [] + for tf in self.transformation_functions: + transformed_column_names.extend(tf.output_column_names) + transformation_features.update(tf.hopsworks_udf.transformation_features) + + return [ + feature.name + for feature in self.features + if feature.name not in transformation_features + ] + transformed_column_names From 2753ec47328d87d2fb1ec5f633d9cf342470eb12 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 28 May 2024 15:15:41 +0200 Subject: [PATCH 40/58] updating docstring and adding property decorator missed during rebase --- python/hsfs/core/vector_server.py | 1 + python/hsfs/hopsworks_udf.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 5a344db890..9a882523b6 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -1065,6 +1065,7 @@ def default_client(self, default_client: Literal["rest", "sql"]): _logger.debug(f"Default Online Store Client is set to {default_client}.") self._default_client = default_client + @property def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 328ac3c091..246483c9b4 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -35,9 +35,10 @@ def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": Create an User Defined Function that can be and used within the Hopsworks Feature Store. Hopsworks UDF's are user defined functions that executes as 'pandas_udf' when executing - in spark engine and as pandas functions in the python engine. A Hopsworks udf is defined - using the `hopsworks_udf` decorator. The outputs of the defined UDF must be mentioned in the - decorator as a list of python types. + in spark engine and as pandas functions in the python engine. The pandas udf/pandas functions + get pandas Series as inputs and can provide a pandas Series or a pandas DataFrame as output. + A Hopsworks udf is defined using the `hopsworks_udf` decorator. The outputs of the defined UDF + must be mentioned in the decorator as a list of python types. !!!
example From 23c7b8a474968ac97977bb0c106c23c00ebacc78 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 7 Jun 2024 09:41:10 +0200 Subject: [PATCH 41/58] refactoring transformation functions to update parsing of statistics parameters and also renaming decorator name --- python/hsfs/builtin_transformations.py | 49 ++--- python/hsfs/engine/spark.py | 2 +- python/hsfs/feature_store.py | 4 +- python/hsfs/hopsworks_udf.py | 193 +++++++++++------- python/hsfs/transformation_function.py | 4 +- python/hsfs/transformation_statistics.py | 119 +++++++++++ python/tests/core/test_feature_view_engine.py | 8 +- .../core/test_training_dataset_engine.py | 4 +- .../test_transformation_function_engine.py | 29 +-- python/tests/engine/test_python.py | 15 +- ...t_python_spark_transformation_functions.py | 54 +++-- python/tests/engine/test_spark.py | 12 +- .../tests/fixtures/feature_view_fixtures.json | 14 +- .../transformation_function_fixtures.json | 29 +-- python/tests/test_feature_view.py | 8 +- .../transformation_test_helper.py | 45 ++-- python/tests/test_hopswork_udf.py | 162 ++++++--------- python/tests/test_transformation_function.py | 39 ++-- 18 files changed, 469 insertions(+), 321 deletions(-) create mode 100644 python/hsfs/transformation_statistics.py diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index d17126bd44..421a04cffe 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -16,39 +16,36 @@ import numpy as np import pandas as pd -from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf +from hsfs.transformation_statistics import TransformationStatistics -@hopsworks_udf(float) -def min_max_scaler(feature: pd.Series, statistics_feature) -> pd.Series: - return (feature - statistics_feature.min) / ( - statistics_feature.max - statistics_feature.min +feature_statistics = TransformationStatistics("feature") + + +@udf(float) +def min_max_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: + return (feature - statistics.feature.min) / ( + statistics.feature.max - statistics.feature.min ) -@hopsworks_udf(float) -def standard_scaler( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: - return (feature - statistics_feature.mean) / statistics_feature.stddev +@udf(float) +def standard_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: + return (feature - statistics.feature.mean) / statistics.feature.stddev -@hopsworks_udf(float) -def robust_scaler( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: - return (feature - statistics_feature.percentiles[49]) / ( - statistics_feature.percentiles[74] - statistics_feature.percentiles[24] +@udf(float) +def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: + return (feature - statistics.feature.percentiles[49]) / ( + statistics.feature.percentiles[74] - statistics.feature.percentiles[24] ) -@hopsworks_udf(int) -def label_encoder( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: +@udf(int) +def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [ - value for value in statistics_feature.extended_statistics["unique_values"] + value for value in statistics.feature.extended_statistics["unique_values"] ] value_to_index = {value: index for index, value in 
enumerate(unique_data)} return pd.Series( @@ -56,12 +53,10 @@ def label_encoder( ) -@hopsworks_udf(bool) -def one_hot_encoder( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: +@udf(bool) +def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [ - value for value in statistics_feature.extended_statistics["unique_values"] + value for value in statistics.feature.extended_statistics["unique_values"] ] one_hot = pd.get_dummies(feature, dtype="bool") for data in unique_data: diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index c462efa641..a22be38cc0 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1269,7 +1269,7 @@ def _apply_transformation_function( output_col_names.append(output_col_name) transformation_features.append(hopsworks_udf.transformation_features) - if len(hopsworks_udf.output_types) > 1: + if len(hopsworks_udf.return_types) > 1: explode_name.append(f"{output_col_name}.*") else: explode_name.append(output_col_name) diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 5400d5d08d..11eeac1983 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -1286,7 +1286,7 @@ def create_transformation_function( !!! example ```python # define the transformation function as a Hopsworks's UDF - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 @@ -1464,7 +1464,7 @@ def create_feature_view( query = fg1.select_all().join(fg2.select_all()) # define the transformation function as a Hopsworks's UDF - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 246483c9b4..e287089545 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -28,9 +28,10 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.decorators import typechecked +from hsfs.transformation_statistics import TransformationStatistics -def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": +def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": """ Create an User Defined Function that can be and used within the Hopsworks Feature Store. @@ -43,15 +44,15 @@ def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": !!! example ```python - from hsfs.hopsworks_udf import hopsworks_udf + from hsfs.hopsworks_udf import udf - @hopsworks_udf(float) + @udf(float) def add_one(data1 : pd.Series): return data1 + 1 ``` # Arguments - output_type: `list`. The output types of the defined UDF + return_type: `list`. The output types of the defined UDF # Returns `HopsworksUdf`: The metadata object for hopsworks UDF's. 
@@ -61,7 +62,7 @@ def add_one(data1 : pd.Series): """ def wrapper(func: Callable) -> HopsworksUdf: - udf = HopsworksUdf(func=func, output_types=output_type) + udf = HopsworksUdf(func=func, return_types=return_type) return udf return wrapper @@ -123,12 +124,12 @@ class HopsworksUdf: def __init__( self, func: Union[Callable, str], - output_types: Union[List[type], type, List[str], str], + return_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, ): - self._output_types: List[str] = HopsworksUdf._validate_and_convert_output_types( - output_types + self._return_types: List[str] = HopsworksUdf._validate_and_convert_output_types( + return_types ) self._function_name: str = func.__name__ if name is None else name @@ -138,20 +139,20 @@ def __init__( if isinstance(func, Callable) else func ) - - self._transformation_features: List[TransformationFeature] = ( - HopsworksUdf._extract_function_arguments(self._function_source) - if not transformation_features - else transformation_features - ) + if not transformation_features: + self._transformation_features: List[TransformationFeature] = ( + HopsworksUdf._extract_function_arguments(func) + if not transformation_features + else transformation_features + ) + else: + self._transformation_features = transformation_features self._formatted_function_source, self._module_imports = ( - HopsworksUdf._format_source_code( - self._function_source, self._transformation_features - ) + HopsworksUdf._format_source_code(self._function_source) ) - self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = None + self._statistics: Optional[TransformationStatistics] = None self._output_column_names: List[str] = self._get_output_column_names() @@ -279,63 +280,67 @@ def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, in ] ) arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [arg for arg in arg_list if not arg.strip() == ""] + arg_list = [arg.split(":")[0].split("=")[0].strip() for arg in arg_list] + if "statistics" in arg_list: + arg_list.remove("statistics") return arg_list, signature, signature_start_line, signature_end_line @staticmethod - def _extract_function_arguments(source_code: str) -> List[TransformationFeature]: + def _extract_function_arguments(function: Callable) -> List[TransformationFeature]: """ Function to extract the argument names from a provided function source code. # Arguments - source_code: `str`. Source code of a function. + source_code: `Callable`. The function for which the value are to be extracted. # Returns `List[TransformationFeature]`: List of TransformationFeature that provide a mapping from feature names to corresponding statistics parameters if any is present. """ - # Get source code of the original function - arg_list, _, _, _ = HopsworksUdf._parse_function_signature(source_code) - - if arg_list == []: + arg_list = [] + statistics = None + signature = inspect.signature(function).parameters + if not signature: raise FeatureStoreException( "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." 
) + for arg in inspect.signature(function).parameters.values(): + if arg.name == "statistics": + statistics = arg.default + else: + arg_list.append(arg.name) - arg_list = [arg.split(":")[0].strip() for arg in arg_list] - - for arg in arg_list: - if arg.startswith("statistics"): - if arg.split("statistics_")[1] not in arg_list: - raise FeatureStoreException( - f"No argument corresponding to statistics parameter '{arg}' present in function definition." - ) - - return [ - TransformationFeature( - arg, f"statistics_{arg}" if f"statistics_{arg}" in arg_list else None - ) - for arg in arg_list - if not arg.startswith("statistics") - ] + if statistics: + missing_statistic_features = [ + statistic_feature + for statistic_feature in statistics._features + if statistic_feature not in arg_list + ] + if missing_statistic_features: + missing_statistic_features = "', '".join(missing_statistic_features) + raise FeatureStoreException( + f"No argument corresponding to statistics parameter '{missing_statistic_features}' present in function definition." + ) + return [ + TransformationFeature(arg, arg if arg in statistics._features else None) + for arg in arg_list + ] + else: + return [TransformationFeature(arg, None) for arg in arg_list] @staticmethod - def _format_source_code( - source_code: str, transformation_features: List[TransformationFeature] - ) -> Tuple[str, str]: + def _format_source_code(source_code: str) -> Tuple[str, str]: """ Function that parses the existing source code to remove statistics parameter and remove all decorators and type hints from the function source code. # Arguments source_code: `str`. Source code of a function. - transformation_features `List[TransformationFeature]`: List of transformation features provided in the function argument. # Returns `Tuple[str, str]`: Tuple that contains Source code that does not contain any decorators, type hints or statistics parameters and the module imports """ - _, signature, _, signature_end_line = HopsworksUdf._parse_function_signature( - source_code + arg_list, signature, _, signature_end_line = ( + HopsworksUdf._parse_function_signature(source_code) ) module_imports = source_code.split("@")[0] - arg_list = [feature.feature_name for feature in transformation_features] # Reconstruct the function signature new_signature = ( @@ -359,8 +364,8 @@ def _get_output_column_names(self) -> str: _BASE_COLUMN_NAME = ( f'{self.function_name}_{"-".join(self.transformation_features)}_' ) - if len(self.output_types) > 1: - return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.output_types))] + if len(self.return_types) > 1: + return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types))] else: return [f"{_BASE_COLUMN_NAME}"] @@ -371,15 +376,15 @@ def _create_pandas_udf_return_schema_from_list(self) -> str: # Returns `str`: DDL-formatted type string that denotes the return types of the user defined function. 
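+ For example, a UDF named add_one over the feature "col1" with return types [float, float] would produce "`add_one_col1_0` double, `add_one_col1_1` double".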
""" - if len(self.output_types) > 1: + if len(self.return_types) > 1: return ", ".join( [ - f"`{self.output_column_names[i]}` {self.output_types[i]}" - for i in range(len(self.output_types)) + f"`{self.output_column_names[i]}` {self.return_types[i]}" + for i in range(len(self.return_types)) ] ) else: - return self.output_types[0] + return self.return_types[0] def hopsworksUdf_wrapper(self) -> Callable: """ @@ -405,7 +410,7 @@ def hopsworksUdf_wrapper(self) -> Callable: return date_time_col.dt.tz_localize(None).dt.tz_localize(str(current_timezone))""" # Defining wrapper function that renames the column names to specific names - if len(self.output_types) > 1: + if len(self.return_types) > 1: code = ( self._module_imports + "\n" @@ -440,7 +445,7 @@ def renaming_wrapper(*args): # Shallow copy of scope performed because updating statistics argument of scope must not affect other instances. scope = __import__("__main__").__dict__.copy() if self.transformation_statistics is not None: - scope.update(self.transformation_statistics) + scope.update({"statistics": self.transformation_statistics}) scope.update({"_output_col_names": self.output_column_names}) # executing code exec(code, scope) @@ -485,8 +490,8 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": return udf def update_return_type_one_hot(self): - self._output_types = [ - self._output_types[0] + self._return_types = [ + self._return_types[0] for _ in range( len( self.transformation_statistics[ @@ -530,8 +535,11 @@ def to_dict(self) -> Dict[str, Any]: """ return { "sourceCode": self._function_source, - "outputTypes": self.output_types, + "outputTypes": self.return_types, "transformationFeatures": self.transformation_features, + "statisticsArgumentNames": self._statistics_argument_names + if self.statistics_required + else None, "name": self._function_name, } @@ -568,24 +576,51 @@ def from_response_json( feature.strip() for feature in json_decamelized["transformation_features"].split(",") ] + statistics_features = ( + [ + feature.strip() + for feature in json_decamelized["statistics_argument_names"].split(",") + ] + if "statistics_argument_names" in json_decamelized + else None + ) + + # Reconstructing statistics arguments. + arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) + + if statistics_features: + transformation_features = [ + TransformationFeature( + transformation_features[arg_index], + arg_list[arg_index] + if arg_list[arg_index] in statistics_features + else None, + ) + for arg_index in range(len(arg_list)) + ] + else: + transformation_features = [ + TransformationFeature(transformation_features[arg_index], None) + for arg_index in range(len(arg_list)) + ] hopsworks_udf = cls( - func=function_source_code, output_types=output_types, name=function_name + func=function_source_code, + return_types=output_types, + name=function_name, + transformation_features=transformation_features, ) # Set transformation features if already set. 
- if "" not in transformation_features: - return hopsworks_udf(*transformation_features) - else: - return hopsworks_udf + return hopsworks_udf @property - def output_types(self) -> List[str]: + def return_types(self) -> List[str]: """Get the output types of the UDF""" # Update the number of outputs for one hot encoder to match the number of unique values for the feature if self.function_name == "one_hot_encoder" and self.transformation_statistics: self.update_return_type_one_hot() - return self._output_types + return self._return_types @property def function_name(self) -> str: @@ -600,7 +635,7 @@ def statistics_required(self) -> bool: @property def transformation_statistics( self, - ) -> Optional[Dict[str, FeatureDescriptiveStatistics]]: + ) -> Optional[TransformationStatistics]: """Feature statistics required for the defined UDF""" return self._statistics @@ -640,24 +675,34 @@ def _statistics_argument_mapping(self) -> Dict[str, str]: for transformation_feature in self._transformation_features } + @property + def _statistics_argument_names(self) -> List[str]: + """ + list of argument names required for statistics + """ + return [ + transformation_feature.statistic_argument_name + for transformation_feature in self._transformation_features + if transformation_feature.statistic_argument_name is not None + ] + @transformation_statistics.setter def transformation_statistics( self, statistics: List[FeatureDescriptiveStatistics] ) -> None: - self._statistics = dict() + self._statistics = TransformationStatistics(*self._statistics_argument_names) for stat in statistics: - if stat.feature_name in self._statistics_argument_mapping.keys(): - self._statistics[ - self._statistics_argument_mapping[stat.feature_name] - ] = stat + self._statistics.set_statistics( + self._statistics_argument_mapping[stat.feature_name], stat.to_dict() + ) @output_column_names.setter def output_column_names(self, output_col_names: Union[str, List[str]]) -> None: if not isinstance(output_col_names, List): output_col_names = [output_col_names] - if len(output_col_names) != len(self.output_types): + if len(output_col_names) != len(self.return_types): raise FeatureStoreException( - f"Provided names for output columns does not match the number of columns returned from the UDF. Please provide {len(self.output_types)} names." + f"Provided names for output columns does not match the number of columns returned from the UDF. Please provide {len(self.return_types)} names." 
) else: self._output_column_names = output_col_names diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 3267b4d14a..a3f6a295d7 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -74,7 +74,7 @@ def save(self) -> None: # import hopsworks udf decorator from hsfs.hopsworks_udf import HopsworksUdf # define function - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 @@ -98,7 +98,7 @@ def delete(self) -> None: # import hopsworks udf decorator from hsfs.hopsworks_udf import HopsworksUdf # define function - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 diff --git a/python/hsfs/transformation_statistics.py b/python/hsfs/transformation_statistics.py new file mode 100644 index 0000000000..f4b6b1c0e5 --- /dev/null +++ b/python/hsfs/transformation_statistics.py @@ -0,0 +1,119 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, Mapping, Optional, Union + +import humps + + +@dataclass +class FeatureTransformationStatistics: + """ + Data class that contains all the statistics parameters that can be used for transformations. 
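+ Instances are created with only the feature name set and are later populated from computed descriptive statistics via TransformationStatistics.set_statistics.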
+ """ + + feature_name: str + count: int = None + # for any feature type + completeness: Optional[float] = None + num_non_null_values: Optional[int] = None + num_null_values: Optional[int] = None + approx_num_distinct_values: Optional[int] = None + # for numerical features + min: Optional[float] = None + max: Optional[float] = None + sum: Optional[float] = None + mean: Optional[float] = None + stddev: Optional[float] = None + percentiles: Optional[Mapping[str, float]] = None + # with exact uniqueness + distinctness: Optional[float] = None + entropy: Optional[float] = None + uniqueness: Optional[float] = None + exact_num_distinct_values: Optional[int] = None + extended_statistics: Optional[Union[dict, str]] = None + + def __init__( + self, + feature_name: str, + count: int = None, + completeness: Optional[float] = None, + num_non_null_values: Optional[int] = None, + num_null_values: Optional[int] = None, + approx_num_distinct_values: Optional[int] = None, + min: Optional[float] = None, + max: Optional[float] = None, + sum: Optional[float] = None, + mean: Optional[float] = None, + stddev: Optional[float] = None, + percentiles: Optional[Mapping[str, float]] = None, + distinctness: Optional[float] = None, + entropy: Optional[float] = None, + uniqueness: Optional[float] = None, + exact_num_distinct_values: Optional[int] = None, + extended_statistics: Optional[Union[dict, str]] = None, + **kwargs, + ): + self.feature_name = feature_name + self.count = count + self.completeness = completeness + self.num_non_null_values = num_non_null_values + self.num_null_values = num_null_values + self.approx_num_distinct_values = approx_num_distinct_values + self.min = min + self.max = max + self.sum = sum + self.mean = mean + self.stddev = stddev + self.percentiles = percentiles + self.distinctness = distinctness + self.entropy = entropy + self.uniqueness = uniqueness + self.exact_num_distinct_values = exact_num_distinct_values + self.extended_statistics = extended_statistics + + @classmethod + def from_response_json( + cls: FeatureTransformationStatistics, json_dict: Dict[str, Any] + ): + json_decamelized = humps.decamelize(json_dict) + return cls(**json_decamelized) + + +class TransformationStatistics: + """ + Class that stores statistics of all features required for a transformation function. 
+ """ + + def __init__(self, *features): + self._features = features + self.__dict__.update( + {feature: self.init_statistics(feature) for feature in features} + ) + + def init_statistics(self, feature_name): + return FeatureTransformationStatistics(feature_name=feature_name) + + def set_statistics(self, feature_name, statistics: Dict[str, Any]): + self.__dict__[feature_name] = ( + FeatureTransformationStatistics.from_response_json(statistics) + ) + + def __repr__(self) -> str: + return ",\n ".join([repr(self.__dict__[feature]) for feature in self._features]) diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index b1fb7ee08a..f6a141fb20 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -29,7 +29,7 @@ from hsfs.constructor.query import Query from hsfs.core import arrow_flight_client, feature_view_engine from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.storage_connector import BigQueryConnector, StorageConnector from hsfs.transformation_function import TransformationFunction @@ -565,7 +565,7 @@ def test_get_attached_transformation_fn(self, mocker): feature_store_id=feature_store_id ) - @hopsworks_udf(int) + @udf(int) def test2(col1): return col1 + 1 @@ -593,7 +593,7 @@ def test_get_attached_transformation_fn_multiple(self, mocker): feature_store_id=feature_store_id ) - @hopsworks_udf(int) + @udf(int) def test1(col1): return col1 + 1 @@ -602,7 +602,7 @@ def test1(col1): hopsworks_udf=test1, ) - @hopsworks_udf(int) + @udf(int) def test2(col1): return col1 + 2 diff --git a/python/tests/core/test_training_dataset_engine.py b/python/tests/core/test_training_dataset_engine.py index 5e77445971..fea3d43f88 100644 --- a/python/tests/core/test_training_dataset_engine.py +++ b/python/tests/core/test_training_dataset_engine.py @@ -23,7 +23,7 @@ ) from hsfs.constructor import query from hsfs.core import training_dataset_engine -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf class TestTrainingDatasetEngine: @@ -118,7 +118,7 @@ def test_save_transformation_functions(self, mocker): mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") - @hopsworks_udf(int) + @udf(int) def plus_one(a): return a + 1 diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index 51dd623ef1..11cd593cc3 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -24,7 +24,7 @@ transformation_function, ) from hsfs.core import transformation_function_engine -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf fg1 = feature_group.FeatureGroup( @@ -91,7 +91,7 @@ def test_save(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction(col1): return col1 + 1 @@ -118,7 +118,7 @@ def test_get_transformation_fn(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -127,7 +127,7 @@ def testFunction1(col1): hopsworks_udf=testFunction1, ) - @hopsworks_udf(float) + @udf(float) def testFunction2(data2, statistics_data2): return data2 + 1 @@ -159,7 +159,7 @@ def test_get_transformation_fns(self, mocker): feature_store_id 
) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -168,7 +168,7 @@ def testFunction1(col1): hopsworks_udf=testFunction1, ) - @hopsworks_udf(float) + @udf(float) def testFunction2(data2, statistics_data2): return data2 + 1 @@ -200,7 +200,7 @@ def test_delete(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -259,7 +259,7 @@ def test_compute_and_set_feature_statistics_no_split(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -318,7 +318,7 @@ def test_compute_and_set_feature_statistics_train_test_split(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -376,7 +376,7 @@ def test_get_and_set_feature_statistics_no_statistics_required(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -428,10 +428,13 @@ def test_get_and_set_feature_statistics_statistics_required(self, mocker): tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) + from hsfs.transformation_statistics import TransformationStatistics - @hopsworks_udf(int) - def testFunction1(col1, statistics_col1): - return col1 + statistics_col1.mean + stats = TransformationStatistics("col1") + + @udf(int) + def testFunction1(col1, statistics=stats): + return col1 + statistics.col1.mean tf1 = transformation_function.TransformationFunction( feature_store_id, diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 4b883f8ed2..4796ad2cfe 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -36,7 +36,7 @@ from hsfs.constructor.hudi_feature_group_alias import HudiFeatureGroupAlias from hsfs.core import inode, job from hsfs.engine import python -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.training_dataset_feature import TrainingDatasetFeature from polars.testing import assert_frame_equal as polars_assert_frame_equal @@ -3240,7 +3240,7 @@ def test_apply_transformation_function_pandas(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -3280,7 +3280,7 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def plus_two(col1): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) @@ -3324,7 +3324,7 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def plus_two(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) @@ -3368,7 +3368,7 @@ def test_apply_transformation_function_polars(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -3896,7 +3896,10 @@ def test_materialization_kafka_skip_offsets(self, mocker): python_engine._write_dataframe_kafka( feature_group=fg, dataframe=df, - offline_write_options={"start_offline_materialization": True, "skip_offsets": True}, + offline_write_options={ + "start_offline_materialization": True, + "skip_offsets": True, + }, ) # Assert diff --git 
a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index 4929312bec..cf0d529611 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -31,7 +31,7 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.engine import python, spark -from hsfs.hopsworks_udf import HopsworksUdf, hopsworks_udf +from hsfs.hopsworks_udf import HopsworksUdf, udf from pyspark.sql.types import ( BooleanType, DateType, @@ -148,15 +148,18 @@ def test_apply_builtin_minmax_from_backend(self, mocker): # Arrange tf_fun_source = ( - "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" - "from hsfs.hopsworks_udf import hopsworks_udf\n" - "@hopsworks_udf(float)\ndef min_max_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" - " return (feature - statistics_feature.min)/(statistics_feature.max-statistics_feature.min)\n" + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def min_max_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.min) / (statistics.feature.max - statistics.feature.min)" ) udf_response = { "sourceCode": tf_fun_source, "outputTypes": "double", "transformationFeatures": "", + "statisticsArgumentNames": "feature", "name": "min_max_scaler", } @@ -283,15 +286,18 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): # Arrange tf_fun_source = ( - "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" - "from hsfs.hopsworks_udf import hopsworks_udf\n" - "@hopsworks_udf(float)\ndef standard_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" - " return (feature - statistics_feature.mean)/statistics_feature.stddev\n" + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def standard_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.mean) / statistics.feature.stddev" ) udf_response = { "sourceCode": tf_fun_source, "outputTypes": "double", "transformationFeatures": "", + "statisticsArgumentNames": "feature", "name": "standard_scaler", } @@ -421,15 +427,19 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): # Arrange tf_fun_source = ( - "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" - "from hsfs.hopsworks_udf import hopsworks_udf\n" - "@hopsworks_udf(float)\ndef robust_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" - " return (feature - statistics_feature.percentiles[49])/(statistics_feature.percentiles[74]-statistics_feature.percentiles[24])\n" + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 
'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def robust_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.percentiles[49]) / (statistics.feature.percentiles[74] - " + "statistics.feature.percentiles[24])" ) udf_response = { "sourceCode": tf_fun_source, "outputTypes": "double", "transformationFeatures": "", + "statisticsArgumentNames": "feature", "name": "robust_scaler", } @@ -561,7 +571,7 @@ def test_apply_plus_one_int(self, mocker): ) # Arrange - @hopsworks_udf(int) + @udf(int) def tf_fun(col_0): return col_0 + 1 @@ -619,7 +629,7 @@ def test_apply_plus_one_str(self, mocker): ) # Arrange - @hopsworks_udf(str) + @udf(str) def tf_fun(col_0): return col_0 + "1" @@ -676,7 +686,7 @@ def test_apply_plus_one_double(self, mocker): spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - @hopsworks_udf(float) + @udf(float) def tf_fun(col_0): return col_0 + 1.0 @@ -748,7 +758,7 @@ def test_apply_plus_one_datetime_no_tz(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0): import datetime @@ -823,7 +833,7 @@ def test_apply_plus_one_datetime_tz_utc(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -901,7 +911,7 @@ def test_apply_plus_one_datetime_tz_pst(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -979,7 +989,7 @@ def test_apply_plus_one_datetime_ts_none(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -1053,7 +1063,7 @@ def test_apply_plus_one_date(self, mocker): ) # Arrange - @hopsworks_udf(datetime.date) + @udf(datetime.date) def tf_fun(col_0): import datetime @@ -1079,7 +1089,7 @@ def test_apply_plus_one_invalid_type(self, mocker): # Arrange with pytest.raises(FeatureStoreException) as e_info: - @hopsworks_udf(list) + @udf(list) def tf_fun(a): return a + 1 diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 09300059f3..42e0abe4e6 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -34,7 +34,7 @@ from hsfs.constructor import hudi_feature_group_alias, query from hsfs.core import training_dataset_engine from hsfs.engine import spark -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.training_dataset_feature import TrainingDatasetFeature from pyspark.sql import DataFrame from pyspark.sql.types import ( @@ -2668,7 +2668,7 @@ def test_write_training_dataset_splits(self, mocker): spark_engine = spark.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -2717,7 +2717,7 @@ def test_write_training_dataset_splits_to_df(self, mocker): spark_engine = spark.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -4328,7 +4328,7 @@ def test_apply_transformation_function_single_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -4388,7 +4388,7 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def plus_two(col1): return 
pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) @@ -4449,7 +4449,7 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def test(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index da5c7766ed..a0a9f6864d 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -690,10 +690,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } }, { @@ -701,7 +702,7 @@ "version": 1, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" @@ -929,10 +930,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } }, { @@ -940,7 +942,7 @@ "version": 1, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 169d779bd6..96fac98fc8 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -5,7 +5,7 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" @@ -18,10 +18,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } } }, @@ -31,10 +32,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(str)\ndef 
test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "test_func", "outputTypes":"string", - "transformationFeatures":"feature1, feature2, feature3" + "transformationFeatures":"feature1, feature2, feature3", + "statisticsArgumentNames":"data1, data2" } } }, @@ -44,10 +46,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", + "sourceCode": "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", "name": "test_func", "outputTypes":"string, double", - "transformationFeatures":"feature1, feature2, feature3" + "transformationFeatures":"feature1, feature2, feature3", + "statisticsArgumentNames":"data1, data2" } } }, @@ -60,10 +63,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } }, { @@ -71,7 +75,7 @@ "version": 1, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" @@ -89,10 +93,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } } ] diff --git a/python/tests/test_feature_view.py b/python/tests/test_feature_view.py index e8e36c0f1e..a45093126b 100644 --- a/python/tests/test_feature_view.py +++ b/python/tests/test_feature_view.py @@ -18,7 +18,7 @@ from hsfs import feature_view, training_dataset_feature from hsfs.constructor import fs_query, query from hsfs.feature_store import FeatureStore -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf class TestFeatureView: @@ -100,11 +100,11 @@ def test_from_response_json_transformation_function(self, mocker, backend_fixtur ) assert ( fv.transformation_functions[0].hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) assert ( fv.transformation_functions[1].hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + == "\n@udf(float)\ndef add_one_fs(data1 : 
pd.Series):\n return data1 + 1\n" ) assert len(fv.schema) == 2 assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) @@ -144,7 +144,7 @@ def test_transformation_function_instances(self, mocker, backend_fixtures): # Act q = fs_query.FsQuery.from_response_json(json) - @hopsworks_udf(int) + @udf(int) def test(col1): return col1 + 1 diff --git a/python/tests/test_helpers/transformation_test_helper.py b/python/tests/test_helpers/transformation_test_helper.py index 8b81c48fde..2a502692a1 100644 --- a/python/tests/test_helpers/transformation_test_helper.py +++ b/python/tests/test_helpers/transformation_test_helper.py @@ -1,5 +1,11 @@ import pandas as pd -from hsfs.statistics import FeatureDescriptiveStatistics +from hsfs.transformation_statistics import TransformationStatistics + + +stats_arg1 = TransformationStatistics("arg1") +stats_arg1_arg3 = TransformationStatistics("arg1", "arg3") +stats_arg1_arg2 = TransformationStatistics("arg1", "arg2") +stats_arg3 = TransformationStatistics("arg3") def test_function(): @@ -10,7 +16,7 @@ def test_function_one_argument(arg1): pass -def test_function_one_argument_with_statistics(arg1, statistics_arg1): +def test_function_one_argument_with_statistics(arg1, statistics=stats_arg1): pass @@ -19,7 +25,7 @@ def test_function_one_argument_with_typehints(arg1: pd.Series): def test_function_one_argument_with_statistics_and_typehints( - arg1: pd.Series, statistics_arg1: FeatureDescriptiveStatistics + arg1: pd.Series, statistics=stats_arg1 ): pass @@ -29,7 +35,7 @@ def test_function_multiple_argument(arg1, arg2): def test_function_multiple_argument_with_statistics( - arg1, arg2, arg3, statistics_arg1, statistics_arg3 + arg1, arg2, arg3, statistics=stats_arg1_arg3 ): pass @@ -39,39 +45,25 @@ def test_function_multiple_argument_with_typehints(arg1: pd.Series, arg2: pd.Ser def test_function_multiple_argument_with_statistics_and_typehints( - arg1: pd.Series, - arg2: pd.Series, - statistics_arg1: FeatureDescriptiveStatistics, - statistics_arg2: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2: pd.Series, statistics=stats_arg1_arg2 ): pass def test_function_multiple_argument_with_mixed_statistics_and_typehints( - arg1: pd.Series, - arg2, - arg3, - statistics_arg1, - statistics_arg3: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2, arg3, statistics=stats_arg1_arg3 ): pass def test_function_multiple_argument_all_parameter_with_spaces( - arg1: pd.Series, - arg2, - statistics_arg1, - statistics_arg2: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2, statistics=stats_arg1_arg2 ): pass def test_function_multiple_argument_all_parameter_multiline( - arg1: pd.Series, - arg2, - statistics_arg1, - arg3, - statistics_arg3: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2, arg3, statistics=stats_arg1_arg3 ): pass @@ -79,14 +71,11 @@ def test_function_multiple_argument_all_parameter_multiline( def test_function_multiple_argument_all_parameter_multiline_with_comments( arg1: pd.Series, # Test Comment arg2, - statistics_arg1, # Test Comment - arg3, - statistics_arg3: FeatureDescriptiveStatistics, + arg3, # Test Comment + statistics=stats_arg1_arg3, # Test Comment ) -> pd.DataFrame: # Test Comment pass -def test_function_statistics_invalid( - arg1: pd.Series, statistics_arg3: FeatureDescriptiveStatistics -): +def test_function_statistics_invalid(arg1: pd.Series, statistics=stats_arg3): pass diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 04dab45309..402c1857e1 100644 --- 
a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -19,7 +19,7 @@ import pandas as pd import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, hopsworks_udf +from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, udf class TestHopsworksUdf: @@ -95,14 +95,14 @@ def test_get_module_imports(self): "python/tests/test_helpers/transformation_test_helper.py" ) == [ "import pandas as pd", - "from hsfs.statistics import FeatureDescriptiveStatistics", + "from hsfs.transformation_statistics import TransformationStatistics", ] def test_extract_source_code(self): from test_helpers.transformation_test_helper import test_function assert """import pandas as pd -from hsfs.statistics import FeatureDescriptiveStatistics +from hsfs.transformation_statistics import TransformationStatistics def test_function(): return True""" == HopsworksUdf._extract_source_code(test_function).strip() @@ -110,8 +110,7 @@ def test_extract_function_arguments_no_arguments(self): from test_helpers.transformation_test_helper import test_function with pytest.raises(FeatureStoreException) as exception: - function_source = HopsworksUdf._extract_source_code(test_function) - HopsworksUdf._extract_function_arguments(function_source) + HopsworksUdf._extract_function_arguments(test_function) assert ( str(exception.value) @@ -121,8 +120,9 @@ def test_extract_function_arguments_no_arguments(self): def test_extract_function_arguments_one_argument(self): from test_helpers.transformation_test_helper import test_function_one_argument - function_source = HopsworksUdf._extract_source_code(test_function_one_argument) - function_argument = HopsworksUdf._extract_function_arguments(function_source) + function_argument = HopsworksUdf._extract_function_arguments( + test_function_one_argument + ) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None) @@ -133,15 +133,12 @@ def test_extract_function_arguments_one_argument_with_statistics(self): test_function_one_argument_with_statistics, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_one_argument_with_statistics ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ) + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1") ] def test_extract_function_arguments_one_argument_with_typehint(self): @@ -149,10 +146,9 @@ def test_extract_function_arguments_one_argument_with_typehint(self): test_function_one_argument_with_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_one_argument_with_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None) @@ -165,15 +161,12 @@ def test_extract_function_arguments_one_argument_with_statistics_and_typehints( test_function_one_argument_with_statistics_and_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_one_argument_with_statistics_and_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert 
function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ) + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1") ] def test_extract_function_arguments_multiple_argument(self): @@ -181,10 +174,9 @@ def test_extract_function_arguments_multiple_argument(self): test_function_multiple_argument, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None), @@ -196,19 +188,14 @@ def test_extract_function_arguments_multiple_argument_with_statistics(self): test_function_multiple_argument_with_statistics, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_statistics ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_multiple_argument_with_typehints(self): @@ -216,10 +203,9 @@ def test_extract_function_arguments_multiple_argument_with_typehints(self): test_function_multiple_argument_with_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None), @@ -233,18 +219,13 @@ def test_extract_function_arguments_multiple_argument_with_statistics_and_typehi test_function_multiple_argument_with_statistics_and_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_statistics_and_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), - TransformationFeature( - feature_name="arg2", statistic_argument_name="statistics_arg2" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name="arg2"), ] def test_extract_function_arguments_multiple_argument_with_mixed_statistics_and_typehints( @@ -254,19 +235,14 @@ def test_extract_function_arguments_multiple_argument_with_mixed_statistics_and_ test_function_multiple_argument_with_mixed_statistics_and_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_mixed_statistics_and_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - 
feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_multiple_argument_all_parameter_with_spaces( @@ -276,18 +252,13 @@ def test_extract_function_arguments_multiple_argument_all_parameter_with_spaces( test_function_multiple_argument_all_parameter_with_spaces, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_all_parameter_with_spaces ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), - TransformationFeature( - feature_name="arg2", statistic_argument_name="statistics_arg2" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name="arg2"), ] def test_extract_function_arguments_multiple_argument_all_parameter_multiline(self): @@ -295,19 +266,14 @@ def test_extract_function_arguments_multiple_argument_all_parameter_multiline(se test_function_multiple_argument_all_parameter_multiline, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_all_parameter_multiline ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_multiple_argumen_all_parameter_multiline_with_comments( @@ -317,19 +283,14 @@ def test_extract_function_arguments_multiple_argumen_all_parameter_multiline_wit test_function_multiple_argument_all_parameter_multiline_with_comments, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_all_parameter_multiline_with_comments ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_statistics_invalid(self): @@ -338,14 +299,11 @@ def test_extract_function_arguments_statistics_invalid(self): ) with pytest.raises(FeatureStoreException) as exception: - function_source = HopsworksUdf._extract_source_code( - test_function_statistics_invalid - ) - 
HopsworksUdf._extract_function_arguments(function_source) + HopsworksUdf._extract_function_arguments(test_function_statistics_invalid) assert ( str(exception.value) - == "No argument corresponding to statistics parameter 'statistics_arg3' present in function definition." + == "No argument corresponding to statistics parameter 'arg3' present in function definition." ) def test_format_source_code(self): @@ -356,13 +314,11 @@ def test_format_source_code(self): function_source = HopsworksUdf._extract_source_code( test_function_multiple_argument_all_parameter_multiline_with_comments ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) - print("\n") - print(function_argument) + formated_source, module_imports = HopsworksUdf._format_source_code( - function_source, function_argument + function_source ) - print(formated_source) + assert ( formated_source.strip() == """def test_function_multiple_argument_all_parameter_multiline_with_comments(arg1, arg2, arg3): @@ -370,21 +326,21 @@ def test_format_source_code(self): ) def test_generate_output_column_names_one_argument_one_output_type(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 assert test_func._get_output_column_names() == ["test_func_col1_"] def test_generate_output_column_names_multiple_argument_one_output_type(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1, col2, col3): return col1 + 1 assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] def test_generate_output_column_names_single_argument_multiple_output_type(self): - @hopsworks_udf([int, float, int]) + @udf([int, float, int]) def test_func(col1): return pd.DataFrame( {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} @@ -397,7 +353,7 @@ def test_func(col1): ] def test_generate_output_column_names_multiple_argument_multiple_output_type(self): - @hopsworks_udf([int, float, int]) + @udf([int, float, int]) def test_func(col1, col2, col3): return pd.DataFrame( {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} @@ -410,7 +366,7 @@ def test_func(col1, col2, col3): ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 @@ -419,7 +375,7 @@ def test_func(col1): def test_create_pandas_udf_return_schema_from_list_one_argument_multiple_output_type( self, ): - @hopsworks_udf([int, float, str, date, datetime, time, bool]) + @udf([int, float, str, date, datetime, time, bool]) def test_func(col1): return pd.DataFrame( { @@ -438,7 +394,7 @@ def test_func(col1): ) def test_hopsworks_wrapper_single_output(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 @@ -452,7 +408,7 @@ def test_func(col1): assert result.values.tolist() == [2, 3, 4, 5] def test_hopsworks_wrapper_multiple_output(self): - @hopsworks_udf([int, float]) + @udf([int, float]) def test_func(col1, col2): return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) @@ -470,7 +426,7 @@ def test_func(col1, col2): assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 @@ -481,23 +437,37 @@ def test_func(col1): assert test_func("new_feature").statistics_features == [] def test_HopsworkUDf_call_one_argument_statistics(self): - @hopsworks_udf(int) - def test_func(col1, statistics_col1): - return col1 + statistics_col1 + from hsfs.transformation_statistics import TransformationStatistics + + stats 
= TransformationStatistics("col1") + + @udf(int) + def test_func(col1, statistics=stats): + return col1 + statistics.col1.mean assert test_func.transformation_features == ["col1"] assert test_func.statistics_features == ["col1"] + assert test_func._statistics_argument_names == ["col1"] assert test_func("new_feature").transformation_features == ["new_feature"] assert test_func("new_feature").statistics_features == ["new_feature"] + assert test_func("new_feature")._statistics_argument_names == ["col1"] def test_HopsworkUDf_call_multiple_argument_statistics(self): - @hopsworks_udf(int) - def test_func(col1, statistics_col1, col2, col3, statistics_col3): - return col1 + statistics_col1 + from hsfs.transformation_statistics import TransformationStatistics + + stats = TransformationStatistics("col1", "col3") + + @udf(int) + def test_func(col1, col2, col3, statistics=stats): + return col1 + statistics.col1.mean + statistics.col3.mean assert test_func.transformation_features == ["col1", "col2", "col3"] assert test_func.statistics_features == ["col1", "col3"] assert test_func("f1", "f2", "f3").transformation_features == ["f1", "f2", "f3"] assert test_func("f1", "f2", "f3").statistics_features == ["f1", "f3"] + assert test_func("f1", "f2", "f3")._statistics_argument_names == [ + "col1", + "col3", + ] diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index b54fbdbe6b..bfc2f125d0 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -17,7 +17,7 @@ import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.transformation_function import TransformationFunction @@ -36,13 +36,14 @@ def test_from_response_json_one_argument_no_statistics(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_one_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert not tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["col1"] assert tf.hopsworks_udf.statistics_features == [] + assert tf.hopsworks_udf._statistics_argument_names == [] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) def test_from_response_json_one_argument_with_statistics(self, backend_fixtures): @@ -59,13 +60,14 @@ def test_from_response_json_one_argument_with_statistics(self, backend_fixtures) assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_mean_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["data"] assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_from_response_json_multiple_argument_with_statistics( @@ -84,7 +86,7 @@ def 
test_from_response_json_multiple_argument_with_statistics( assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "test_func" - assert tf.hopsworks_udf.output_types == ["string"] + assert tf.hopsworks_udf.return_types == ["string"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == [ "feature1", @@ -92,9 +94,10 @@ def test_from_response_json_multiple_argument_with_statistics( "feature3", ] assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1", "data2"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(str)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n" + == "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_from_response_json_multiple_return_type_functions(self, backend_fixtures): @@ -111,7 +114,7 @@ def test_from_response_json_multiple_return_type_functions(self, backend_fixture assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "test_func" - assert tf.hopsworks_udf.output_types == ["string", "double"] + assert tf.hopsworks_udf.return_types == ["string", "double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == [ "feature1", @@ -119,9 +122,10 @@ def test_from_response_json_multiple_return_type_functions(self, backend_fixture "feature3", ] assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1", "data2"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n" + == "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n" ) def test_from_response_json_list_empty(self, backend_fixtures): @@ -148,13 +152,14 @@ def test_from_response_json_list(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_mean_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["data"] assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) tf = tf_list[1] @@ -162,13 +167,14 @@ def test_from_response_json_list(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 1 assert tf.hopsworks_udf.function_name == "add_one_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert not tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["col1"] assert tf.hopsworks_udf.statistics_features == [] + assert tf.hopsworks_udf._statistics_argument_names == [] assert ( 
tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_from_response_json_list_one_argument(self, backend_fixtures): @@ -186,13 +192,14 @@ def test_from_response_json_list_one_argument(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_mean_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["data"] assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_transformation_function_definition_no_hopworks_udf(self): @@ -211,7 +218,7 @@ def test(col1): ) def test_transformation_function_definition_with_hopworks_udf(self): - @hopsworks_udf(int) + @udf(int) def test2(col1): return col1 + 1

From 659f2aba9e4b1f0331bf12564de959fede0bc682 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 7 Jun 2024 09:47:54 +0200 Subject: [PATCH 42/58] refactoring transformation functions to update parsing of statistics parameters and renaming the decorator --- python/hsfs/hopsworks_udf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index e287089545..96ef119866 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -280,7 +280,11 @@ def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, in ] ) arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [arg.split(":")[0].split("=")[0].strip() for arg in arg_list] + arg_list = [ + arg.split(":")[0].split("=")[0].strip() + for arg in arg_list + if not arg.strip() == "" + ] if "statistics" in arg_list: arg_list.remove("statistics") return arg_list, signature, signature_start_line, signature_end_line

From 0a22fd79c2c5219c84473e1256a5d58a05c58109 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 7 Jun 2024 10:13:02 +0200 Subject: [PATCH 43/58] reformatting with ruff --- python/hsfs/core/vector_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 9a882523b6..9d39d81e09 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -1084,4 +1084,4 @@ def transformed_feature_vector_col_name(self): if feature not in transformation_features ] self._transformed_feature_vector_col_name.extend(output_column_names) - return self._transformed_feature_vector_col_name \ No newline at end of file + return self._transformed_feature_vector_col_name

From 159da54794fd90a5204399ccbb466309266d6306 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sun, 9 Jun 2024 21:29:20 +0200 Subject: [PATCH 44/58] adding statistics to udf only if required --- python/hsfs/hopsworks_udf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 96ef119866..e3cd99de56 100644 --- a/python/hsfs/hopsworks_udf.py +++ 
b/python/hsfs/hopsworks_udf.py @@ -696,9 +696,10 @@ def transformation_statistics( ) -> None: self._statistics = TransformationStatistics(*self._statistics_argument_names) for stat in statistics: - self._statistics.set_statistics( - self._statistics_argument_mapping[stat.feature_name], stat.to_dict() - ) + if stat.feature_name in self._statistics_argument_mapping.keys(): + self._statistics.set_statistics( + self._statistics_argument_mapping[stat.feature_name], stat.to_dict() + ) @output_column_names.setter def output_column_names(self, output_col_names: Union[str, List[str]]) -> None:

From eef2cb535f9a22a11bec020e7cab2b02adfb7899 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sun, 9 Jun 2024 22:26:09 +0200 Subject: [PATCH 45/58] converting extended statistics to dictionary --- python/hsfs/transformation_statistics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/hsfs/transformation_statistics.py b/python/hsfs/transformation_statistics.py index f4b6b1c0e5..79778d1e33 100644 --- a/python/hsfs/transformation_statistics.py +++ b/python/hsfs/transformation_statistics.py @@ -16,6 +16,7 @@ from __future__ import annotations +import json from dataclasses import dataclass from typing import Any, Dict, Mapping, Optional, Union @@ -86,7 +87,11 @@ def __init__( self.entropy = entropy self.uniqueness = uniqueness self.exact_num_distinct_values = exact_num_distinct_values - self.extended_statistics = extended_statistics + self.extended_statistics = ( + extended_statistics + if not isinstance(extended_statistics, str) + else json.loads(extended_statistics) + ) @classmethod def from_response_json(

From 50e944cf28009f9935ed6848e8cea272b2c24ae3 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sun, 9 Jun 2024 22:31:45 +0200 Subject: [PATCH 46/58] sorting built-in label encoder to maintain consistency --- python/hsfs/builtin_transformations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 421a04cffe..9e2daa0d24 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -44,9 +44,9 @@ def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Serie @udf(int) def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: - unique_data = [ - value for value in statistics.feature.extended_statistics["unique_values"] - ] + unique_data = sorted( + [value for value in statistics.feature.extended_statistics["unique_values"]] + ) value_to_index = {value: index for index, value in enumerate(unique_data)} return pd.Series( [value_to_index[data] if not pd.isna(data) else np.nan for data in feature]

From 7111f86710972212e59d3cdfe6457274cda5d0b1 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 13 Jun 2024 16:37:49 +0200 Subject: [PATCH 47/58] adding type hints for class TransformationStatistics --- python/hsfs/transformation_statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/hsfs/transformation_statistics.py b/python/hsfs/transformation_statistics.py index 79778d1e33..c4a1bc20b1 100644 --- a/python/hsfs/transformation_statistics.py +++ b/python/hsfs/transformation_statistics.py @@ -96,7 +96,7 @@ def __init__( @classmethod def from_response_json( cls: FeatureTransformationStatistics, json_dict: Dict[str, Any] - ): + ) -> FeatureTransformationStatistics: json_decamelized = humps.decamelize(json_dict) return cls(**json_decamelized) @@ -106,16 +106,16 @@ class 
TransformationStatistics: Class that stores statistics of all features required for a transformation function. """ - def __init__(self, *features): + def __init__(self, *features: str): self._features = features self.__dict__.update( {feature: self.init_statistics(feature) for feature in features} ) - def init_statistics(self, feature_name): + def init_statistics(self, feature_name: str) -> FeatureTransformationStatistics: return FeatureTransformationStatistics(feature_name=feature_name) - def set_statistics(self, feature_name, statistics: Dict[str, Any]): + def set_statistics(self, feature_name: str, statistics: Dict[str, Any]) -> None: self.__dict__[feature_name] = ( FeatureTransformationStatistics.from_response_json(statistics) )

From 114a792c4b7a104794619cd656b149272ded239d Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 14 Jun 2024 10:38:45 +0200 Subject: [PATCH 48/58] adapting to backend update returning output_types, transformation_features and statistics_argument_names as lists --- python/hsfs/hopsworks_udf.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index e3cd99de56..544746dc90 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -573,17 +573,15 @@ def from_response_json( function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] output_types = [ - output_type.strip() - for output_type in json_decamelized["output_types"].split(",") + output_type.strip() for output_type in json_decamelized["output_types"] ] transformation_features = [ - feature.strip() - for feature in json_decamelized["transformation_features"].split(",") + feature.strip() for feature in json_decamelized["transformation_features"] ] statistics_features = ( [ feature.strip() - for feature in json_decamelized["statistics_argument_names"].split(",") + for feature in json_decamelized["statistics_argument_names"] ] if "statistics_argument_names" in json_decamelized else None

From bd4bb1f515e8e25e93bd00391b3e3b92b5ae8c04 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 14 Jun 2024 11:27:41 +0200 Subject: [PATCH 49/58] fixing unit tests --- python/hsfs/hopsworks_udf.py | 4 ++ ...t_python_spark_transformation_functions.py | 18 ++++----- .../tests/fixtures/feature_view_fixtures.json | 20 +++++----- .../transformation_function_fixtures.json | 38 +++++++++---------- 4 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 544746dc90..b20465a17a 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -590,6 +590,10 @@ def from_response_json( # Reconstructing statistics arguments. 
arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) + transformation_features = ( + arg_list if not transformation_features else transformation_features + ) + if statistics_features: transformation_features = [ TransformationFeature( diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index cf0d529611..cb1a0652b5 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -157,9 +157,9 @@ def test_apply_builtin_minmax_from_backend(self, mocker): ) udf_response = { "sourceCode": tf_fun_source, - "outputTypes": "double", - "transformationFeatures": "", - "statisticsArgumentNames": "feature", + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], "name": "min_max_scaler", } @@ -295,9 +295,9 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): ) udf_response = { "sourceCode": tf_fun_source, - "outputTypes": "double", - "transformationFeatures": "", - "statisticsArgumentNames": "feature", + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], "name": "standard_scaler", } @@ -437,9 +437,9 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): ) udf_response = { "sourceCode": tf_fun_source, - "outputTypes": "double", - "transformationFeatures": "", - "statisticsArgumentNames": "feature", + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], "name": "robust_scaler", } diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index a0a9f6864d..5e229955bd 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -692,9 +692,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } }, { @@ -704,8 +704,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } ], @@ -932,9 +932,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } }, { @@ -944,8 +944,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } ], diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 96fac98fc8..6fa5d762b7 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ 
b/python/tests/fixtures/transformation_function_fixtures.json @@ -7,8 +7,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } }, @@ -20,9 +20,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } } }, @@ -34,9 +34,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "test_func", - "outputTypes":"string", - "transformationFeatures":"feature1, feature2, feature3", - "statisticsArgumentNames":"data1, data2" + "outputTypes":["string"], + "transformationFeatures":["feature1", "feature2", "feature3"], + "statisticsArgumentNames":["data1", "data2"] } } }, @@ -48,9 +48,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", "name": "test_func", - "outputTypes":"string, double", - "transformationFeatures":"feature1, feature2, feature3", - "statisticsArgumentNames":"data1, data2" + "outputTypes":["string", "double"], + "transformationFeatures":["feature1", "feature2", "feature3"], + "statisticsArgumentNames":["data1", "data2"] } } }, @@ -65,9 +65,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } }, { @@ -77,8 +77,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } ] @@ -95,9 +95,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } } ]

From 64f34cdd4e4d519740ffda4baabf88d13d5db477 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 17 Jun 2024 09:17:46 +0200 Subject: [PATCH 50/58] removing space in docstring --- python/hsfs/hopsworks_udf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b20465a17a..83f6e1620f 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -44,9 +44,9 @@ def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": !!! 
example ```python - from hsfs.hopsworks_udf import udf + from hopsworks import udf - @udf(float) + @udf(float) def add_one(data1 : pd.Series): return data1 + 1 ```

From 9891900452fbf6e78f6e12c09d1e11bfeaeadd7d Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 17 Jun 2024 10:53:42 +0200 Subject: [PATCH 51/58] replace '-' in output column names with '_' --- python/hsfs/hopsworks_udf.py | 2 +- python/tests/engine/test_python.py | 12 ++++++------ python/tests/engine/test_spark.py | 30 +++++++++++++++--------------- python/tests/test_hopswork_udf.py | 10 +++++----- 4 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 83f6e1620f..b9f8bde5bb 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -366,7 +366,7 @@ def _get_output_column_names(self) -> str: `List[str]`: List of feature names for the transformed columns """ _BASE_COLUMN_NAME = ( - f'{self.function_name}_{"-".join(self.transformation_features)}_' + f'{self.function_name}_{"_".join(self.transformation_features)}_' ) if len(self.return_types) > 1: return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types))] diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 4796ad2cfe..07958686de 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -3354,12 +3354,12 @@ def plus_two(col1, col2): ) # Assert - assert all(result.columns == ["plus_two_col1-col2_0", "plus_two_col1-col2_1"]) + assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]) assert len(result) == 2 - assert result["plus_two_col1-col2_0"][0] == 2 - assert result["plus_two_col1-col2_0"][1] == 3 - assert result["plus_two_col1-col2_1"][0] == 12 - assert result["plus_two_col1-col2_1"][1] == 13 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 def test_apply_transformation_function_polars(self, mocker): # Arrange @@ -3854,7 +3854,7 @@ def test_materialization_kafka_first_job_execution(self, mocker): args="defaults tests_offsets", await_termination=False, ) - + def test_materialization_kafka_skip_offsets(self, mocker): # Arrange mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 42e0abe4e6..322716da5b 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE_2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -937,7 +937,7 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1053,7 +1053,7 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == "/Projects/test_project_name/Resources/test_query_name-checkpoint" + == "/Projects/test_project_name/Resources/test_query_name_checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1293,7 +1293,7 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -2456,7 +2456,7 @@ def test_time_series_split_date(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017-03-04", "2017-03-05"], + "event_time": ["2017_03_04", "2017_03_05"], } df = pd.DataFrame(data=d) @@ -2516,7 +2516,7 @@ def test_time_series_split_timestamp(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017-03-04", "2017-03-05"], + "event_time": ["2017_03_04", "2017_03_05"], } df = pd.DataFrame(data=d) @@ -3809,7 +3809,7 @@ def __init__(self, label, index): "double": ["1"], "timestamp": [1641340800000], "boolean": ["False"], - "date": ["2022-01-27"], + "date": ["2022_01_27"], "binary": ["1"], "array": [["123"]], "struc": [LabelIndex("0", "1")], @@ -4212,11 +4212,11 @@ def test_setup_s3_hadoop_conf(self, mocker): "fs.s3a.secret.key", s3_connector.secret_key ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server-side-encryption-algorithm", + "fs.s3a.server_side_encryption_algorithm", s3_connector.server_encryption_algorithm, ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server-side-encryption-key", s3_connector.server_encryption_key + "fs.s3a.server_side_encryption_key", s3_connector.server_encryption_key ) 
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.s3a.aws.credentials.provider", @@ -4487,8 +4487,8 @@ def test(col1, col2): expected_df = pd.DataFrame( data={ "col_1": ["test_1", "test_2"], - "test_col_0-col_2_0": [2, 3], - "test_col_0-col_2_1": [12, 13], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], } ) # todo why it doesnt return int? @@ -4514,7 +4514,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' + '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4563,7 +4563,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", + "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.unset.assert_any_call( "fs.gs.encryption.algorithm" @@ -4586,7 +4586,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' + '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4650,7 +4650,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", + "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ) def test_get_unique_values(self): diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 402c1857e1..8494d018f1 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -337,7 +337,7 @@ def test_generate_output_column_names_multiple_argument_one_output_type(self): def test_func(col1, col2, col3): return col1 + 1 - assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] + assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"] def test_generate_output_column_names_single_argument_multiple_output_type(self): @udf([int, float, int]) @@ -360,9 +360,9 @@ def test_func(col1, col2, col3): ) assert test_func._get_output_column_names() == [ - "test_func_col1-col2-col3_0", - "test_func_col1-col2-col3_1", - "test_func_col1-col2-col3_2", + "test_func_col1_col2_col3_0", + "test_func_col1_col2_col3_1", + "test_func_col1_col2_col3_2", ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @@ -422,7 +422,7 @@ def test_func(col1, col2): test_dataframe["column1"], test_dataframe["column2"] ) - assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"]) + assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"]) assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): From 
6ebd9f41e2ff330c3c36353da6eb4c5b69d5d3e1 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 17 Jun 2024 11:03:11 +0200 Subject: [PATCH 52/58] reverting unwanted spark test _ replace changes --- python/tests/engine/test_spark.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 322716da5b..7eabd38d07 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE_2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -937,7 +937,7 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1053,7 +1053,7 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == "/Projects/test_project_name/Resources/test_query_name_checkpoint" + == "/Projects/test_project_name/Resources/test_query_name-checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1293,7 +1293,7 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -2456,7 +2456,7 @@ def test_time_series_split_date(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017_03_04", "2017_03_05"], + "event_time": ["2017-03-04", "2017-03-05"], } df = pd.DataFrame(data=d) @@ -2516,7 +2516,7 @@ def test_time_series_split_timestamp(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017_03_04", "2017_03_05"], + "event_time": ["2017-03-04", "2017-03-05"], } df = pd.DataFrame(data=d) @@ -3809,7 +3809,7 @@ def __init__(self, label, index): "double": ["1"], "timestamp": [1641340800000], "boolean": ["False"], - "date": ["2022_01_27"], + "date": ["2022-01-27"], "binary": ["1"], "array": [["123"]], "struc": [LabelIndex("0", "1")], @@ -4212,11 +4212,11 @@ def test_setup_s3_hadoop_conf(self, mocker): "fs.s3a.secret.key", s3_connector.secret_key
) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server_side_encryption_algorithm", + "fs.s3a.server-side-encryption-algorithm", s3_connector.server_encryption_algorithm, ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server_side_encryption_key", s3_connector.server_encryption_key + "fs.s3a.server-side-encryption-key", s3_connector.server_encryption_key ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.s3a.aws.credentials.provider", @@ -4514,7 +4514,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' + '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4563,7 +4563,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", + "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.unset.assert_any_call( "fs.gs.encryption.algorithm" @@ -4586,7 +4586,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' + '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4650,7 +4650,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", + "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ) def test_get_unique_values(self): From c0202101c913b3362327976ce3c70ff97ebe1108 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 1 Jul 2024 15:43:59 +0200 Subject: [PATCH 53/58] on-demand transformations working --- python/hsfs/builtin_transformations.py | 10 +- python/hsfs/core/feature_group_api.py | 16 +- python/hsfs/core/feature_group_engine.py | 8 +- python/hsfs/core/feature_view_api.py | 24 +-- python/hsfs/core/feature_view_engine.py | 23 --- .../core/transformation_function_engine.py | 23 +-- python/hsfs/core/vector_server.py | 139 ++++++++++++-- python/hsfs/engine/python.py | 40 +++- python/hsfs/engine/spark.py | 49 ++++- python/hsfs/feature.py | 12 ++ python/hsfs/feature_group.py | 100 +++++++++- python/hsfs/feature_store.py | 10 + python/hsfs/feature_view.py | 21 ++- python/hsfs/hopsworks_udf.py | 177 ++++++++++++++++-- python/hsfs/training_dataset_feature.py | 26 ++- python/hsfs/transformation_function.py | 12 +- python/tests/test_hopswork_udf.py | 40 +++- 17 files changed, 589 insertions(+), 141 deletions(-) diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 9e2daa0d24..ae24cd4274 100644 --- a/python/hsfs/builtin_transformations.py +++
b/python/hsfs/builtin_transformations.py @@ -23,26 +23,26 @@ feature_statistics = TransformationStatistics("feature") -@udf(float) +@udf(float, drop=["feature"]) def min_max_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: return (feature - statistics.feature.min) / ( statistics.feature.max - statistics.feature.min ) -@udf(float) +@udf(float, drop=["feature"]) def standard_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: return (feature - statistics.feature.mean) / statistics.feature.stddev -@udf(float) +@udf(float, drop=["feature"]) def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: return (feature - statistics.feature.percentiles[49]) / ( statistics.feature.percentiles[74] - statistics.feature.percentiles[24] ) -@udf(int) +@udf(int, drop=["feature"]) def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = sorted( [value for value in statistics.feature.extended_statistics["unique_values"]] @@ -53,7 +53,7 @@ def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Serie ) -@udf(bool) +@udf(bool, drop=["feature"]) def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [ value for value in statistics.feature.extended_statistics["unique_values"] diff --git a/python/hsfs/core/feature_group_api.py b/python/hsfs/core/feature_group_api.py index 11fdbbbdc6..c6b0a1a70f 100644 --- a/python/hsfs/core/feature_group_api.py +++ b/python/hsfs/core/feature_group_api.py @@ -51,6 +51,9 @@ def save( feature_group_instance.feature_store_id, "featuregroups", ] + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } headers = {"content-type": "application/json"} feature_group_object = feature_group_instance.update_from_response_json( _client._send_request( @@ -58,6 +61,7 @@ def save( path_params, headers=headers, data=feature_group_instance.json(), + query_params=query_params, ), ) return feature_group_object @@ -93,7 +97,11 @@ def get( "featuregroups", name, ] - query_params = None if version is None else {"version": version} + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } + if version is not None: + query_params["version"] = version fg_objs = [] # In principle unique names are enforced across fg type and this should therefore @@ -157,8 +165,10 @@ def get_by_id( "featuregroups", feature_group_id, ] - - fg_json = _client._send_request("GET", path_params) + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } + fg_json = _client._send_request("GET", path_params, query_params) if ( fg_json["type"] == FeatureGroupApi.BACKEND_FG_STREAM or fg_json["type"] == FeatureGroupApi.BACKEND_FG_BATCH diff --git a/python/hsfs/core/feature_group_engine.py b/python/hsfs/core/feature_group_engine.py index 3e88805eda..010810f6cc 100644 --- a/python/hsfs/core/feature_group_engine.py +++ b/python/hsfs/core/feature_group_engine.py @@ -88,7 +88,9 @@ def insert( validation_options: dict = None, ): dataframe_features = engine.get_instance().parse_schema_feature_group( - feature_dataframe, feature_group.time_travel_format + feature_dataframe, + feature_group.time_travel_format, + feature_group.transformation_functions, ) util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features @@ -281,7 +283,9 @@ def insert_stream( ) dataframe_features = engine.get_instance().parse_schema_feature_group( - dataframe, 
feature_group.time_travel_format + dataframe, + feature_group.time_travel_format, + feature_group.transformation_functions, ) util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py index 1bc6b46115..50355f3d5f 100644 --- a/python/hsfs/core/feature_view_api.py +++ b/python/hsfs/core/feature_view_api.py @@ -17,7 +17,7 @@ from typing import List, Optional, Union -from hsfs import client, feature_view, training_dataset, transformation_function +from hsfs import client, feature_view, training_dataset from hsfs.client.exceptions import RestAPIError from hsfs.constructor import query, serving_prepared_statement from hsfs.core import explicit_provenance, job, training_dataset_job_conf @@ -206,28 +206,6 @@ def get_serving_prepared_statement( self._client._send_request("GET", path, query_params, headers=headers) ) - def get_attached_transformation_fn( - self, name: str, version: int - ) -> List["transformation_function.TransformationFunction"]: - """ - Get transformation functions attached to a feature view form the backend - - # Arguments - name `str`: Name of feature view. - version `ìnt`: Version of feature view. - - # Returns - `List[TransformationFunction]` : List of transformation functions attached to the feature view. - - # Raises - `RestAPIError`: If the feature view cannot be found from the backend. - `ValueError`: If the feature group associated with the feature view cannot be found. - """ - path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION] - return transformation_function.TransformationFunction.from_response_json( - self._client._send_request("GET", path) - ) - def create_training_dataset( self, name: str, diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 070be9b821..f85529163f 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -25,7 +25,6 @@ feature_group, feature_view, training_dataset_feature, - transformation_function, util, ) from hsfs.client import exceptions @@ -265,28 +264,6 @@ def get_batch_query_string( return fs_query.pit_query return fs_query.query - def get_attached_transformation_fn( - self, name: str, version: int - ) -> List[transformation_function.TransformationFunction]: - """ - Get transformation functions attached to a feature view form the backend - - # Arguments - name `str`: Name of feature view. - version `ìnt`: Version of feature view. - - # Returns - `List[TransformationFunction]` : List of transformation functions attached to the feature view. - - # Raises - `RestAPIError`: If the feature view cannot be found from the backend. - `ValueError`: If the feature group associated with the feature view cannot be found. 
- """ - transformation_functions = ( - self._feature_view_api.get_attached_transformation_fn(name, version) - ) - return transformation_functions - def create_training_dataset( self, feature_view_obj, diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index ec5de0810b..6bdbff13c9 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -147,21 +147,12 @@ def get_ready_to_use_transformation_fns( feature_view: feature_view.FeatureView, training_dataset_version: Optional[int] = None, ) -> List[transformation_function.TransformationFunction]: - # get attached transformation functions - transformation_functions = ( - feature_view._feature_view_engine.get_attached_transformation_fn( - feature_view.name, feature_view.version - ) - ) - - transformation_functions = ( - [transformation_functions] - if not isinstance(transformation_functions, list) - else transformation_functions - ) - + # check if transformation functions require statistics is_stat_required = any( - [tf.hopsworks_udf.statistics_required for tf in transformation_functions] + [ + tf.hopsworks_udf.statistics_required + for tf in feature_view.transformation_functions + ] ) if not is_stat_required: td_tffn_stats = None @@ -188,11 +179,11 @@ def get_ready_to_use_transformation_fns( ) if is_stat_required: - for transformation_function in transformation_functions: + for transformation_function in feature_view.transformation_functions: transformation_function.hopsworks_udf.transformation_statistics = ( td_tffn_stats.feature_descriptive_statistics ) - return feature_view._sort_transformation_functions(transformation_functions) + return feature_view.transformation_functions @staticmethod def compute_and_set_feature_statistics( diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 9d39d81e09..403cbb2522 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -107,7 +107,10 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: List[ + self._model_dependent_transformation_functions: List[ + transformation_function.TransformationFunction + ] = [] + self._on_demand_transformation_functions: List[ transformation_function.TransformationFunction ] = [] self._sql_client = None @@ -183,13 +186,23 @@ def init_batch_scoring( def init_transformation( self, - entity: Union[feature_view.FeatureView], + entity: feature_view.FeatureView, ): # attach transformation functions - self._transformation_functions = tf_engine_mod.TransformationFunctionEngine.get_ready_to_use_transformation_fns( + self._model_dependent_transformation_functions = tf_engine_mod.TransformationFunctionEngine.get_ready_to_use_transformation_fns( entity, self._training_dataset_version, ) + self._on_demand_transformation_functions = [ + feature.on_demand_transformation_function + for feature in entity.features + if feature.on_demand_transformation_function + ] + self._on_demand_feature_names = [ + feature.name + for feature in entity.features + if feature.on_demand_transformation_function + ] def setup_sql_client( self, @@ -242,6 +255,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[Any], Dict[str, Any]]: 
"""Assembles serving vector from online feature store.""" online_client_choice = self.which_client_and_ensure_initialised( @@ -273,8 +287,8 @@ def get_feature_vector( vector_db_result=vector_db_features or {}, allow_missing=allow_missing, client=online_client_choice, + request_parameters=request_parameters, ) - return self.handle_feature_vector_return_type( vector, batch=False, inference_helper=False, return_type=return_type ) @@ -287,6 +301,7 @@ def get_feature_vectors( ] = None, passed_features: Optional[List[Dict[str, Any]]] = None, vector_db_features: Optional[List[Dict[str, Any]]] = None, + request_parameters: Optional[List[Dict[str, Any]]] = None, allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, @@ -305,6 +320,12 @@ def get_feature_vectors( or len(vector_db_features) == 0 or len(vector_db_features) == len(entries) ), "Vector DB features should be None, empty or have the same length as the entries" + assert ( + request_parameters is None + or len(request_parameters) == 0 + or isinstance(request_parameters, dict) + or len(request_parameters) == len(entries) + ), "Request Parameters should be a Dictionary, None, empty or have the same length as the entries" online_client_choice = self.which_client_and_ensure_initialised( force_rest_client=force_rest_client, force_sql_client=force_sql_client @@ -347,14 +368,23 @@ def get_feature_vectors( skipped_empty_entries.pop(0) if len(skipped_empty_entries) > 0 else None ) vectors = [] + + # If request parameter is a dictionary then copy it to list with the same length as that of entires + request_parameters = ( + [request_parameters] * len(entries) + if isinstance(request_parameters, dict) + else request_parameters + ) for ( idx, passed_values, vector_db_result, + request_parameter, ) in itertools.zip_longest( range(len(entries)), passed_features or [], vector_db_features or [], + request_parameters or [], fillvalue=None, ): if next_skipped == idx: @@ -374,6 +404,7 @@ def get_feature_vectors( vector_db_result=vector_db_result, allow_missing=allow_missing, client=online_client_choice, + request_parameters=request_parameter, ) if vector is not None: @@ -390,6 +421,7 @@ def assemble_feature_vector( vector_db_result: Optional[Dict[str, Any]], allow_missing: bool, client: Literal["rest", "sql"], + request_parameters: Optional[Dict[str, Any]] = None, ) -> Optional[List[Any]]: """Assembles serving vector from online feature store.""" # Errors in batch requests are returned as None values @@ -404,9 +436,52 @@ def assemble_feature_vector( _logger.debug("Updating with passed features: %s", passed_values) result_dict.update(passed_values) - missing_features = set(self.feature_vector_col_name).difference( - result_dict.keys() + missing_features = ( + set(self.feature_vector_col_name) + .difference(result_dict.keys()) + .difference(self._on_demand_feature_names) ) + + # TODO : Optimize this + request_parameters = {} if not request_parameters else request_parameters + available_parameters = set((result_dict | request_parameters).keys()) + missing_request_parameters_features = {} + + for on_demand_feature, on_demand_transformation in zip( + self._on_demand_feature_names, self._on_demand_transformation_functions + ): + missing_request_parameter = ( + set(on_demand_transformation.hopsworks_udf.transformation_features) + - available_parameters + ) + if missing_request_parameter: + missing_request_parameters_features[on_demand_feature] = sorted( + list( + set( + 
on_demand_transformation.hopsworks_udf.transformation_features + ) + - available_parameters + ) + ) + + if missing_request_parameters_features: + error = "Missing request parameters to compute the following on-demand features:\n" + for ( + feature, + missing_request_parameter, + ) in missing_request_parameters_features.items(): + missing_request_parameter = "', '".join(missing_request_parameter) + error += f"On-Demand Feature '{feature}' requires features '{missing_request_parameter}'\n" + error += ( + "Possible reasons: " + "1. There is no match in the given entry." + " Please check if the entry exists in the online feature store" + " or provide the feature as passed_feature. " + f"2. Required entries [{', '.join(self.required_serving_keys)}] or " + f"[{', '.join(set(sk.feature_name for sk in self._serving_keys))}] are not provided." + ) + raise exceptions.FeatureStoreException(error) + # for backward compatibility, before 3.4, if result is empty, # instead of throwing error, it skips the result # Maybe we drop this behaviour for 4.0 @@ -426,8 +501,11 @@ def assemble_feature_vector( if len(self.return_feature_value_handlers) > 0: self.apply_return_value_handlers(result_dict, client=client) - if len(self.transformation_functions) > 0: - self.apply_transformation(result_dict) + if ( + len(self.model_dependent_transformation_functions) > 0 + or len(self.on_demand_transformation_functions) > 0 + ): + self.apply_transformation(result_dict, request_parameters) _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) @@ -473,17 +551,19 @@ def handle_feature_vector_return_type( return pd.DataFrame([feature_vectorz]) elif batch: return pd.DataFrame( - feature_vectorz, columns=self._feature_vector_col_name + feature_vectorz, columns=self.transformed_feature_vector_col_name ) else: pandas_df = pd.DataFrame(feature_vectorz).transpose() - pandas_df.columns = self._feature_vector_col_name + pandas_df.columns = self.transformed_feature_vector_col_name return pandas_df elif return_type.lower() == "polars": _logger.debug("Returning feature vector as polars dataframe") return pl.DataFrame( feature_vectorz if batch else [feature_vectorz], - schema=self._feature_vector_col_name if not inference_helper else None, + schema=self.transformed_feature_vector_col_name + if not inference_helper + else None, orient="row", ) else: @@ -630,9 +710,24 @@ def _set_default_client( self.default_client = self.DEFAULT_SQL_CLIENT self._init_sql_client = True - def apply_transformation(self, row_dict: dict): - _logger.debug("Applying transformation functions.") - for tf in self.transformation_functions: + def apply_transformation(self, row_dict: dict, request_parameter: Dict[str, Any]): + _logger.debug("Applying On-Demand transformation functions.") + for tf in self._on_demand_transformation_functions: + # Check if the feature is provided as a request parameter; if not, get it from the retrieved feature vector.
+ features = [ + pd.Series(request_parameter[feature]) + if feature in request_parameter.keys() + else pd.Series(row_dict[feature]) + for feature in tf.hopsworks_udf.transformation_features + ] + on_demand_feature = tf.hopsworks_udf.get_udf(force_python_udf=True)( + *features + ) # Get only python compatible UDF irrespective of engine + + row_dict[on_demand_feature.name] = on_demand_feature.values[0] + + _logger.debug("Applying Model-Dependent transformation functions.") + for tf in self.model_dependent_transformation_functions: features = [ pd.Series(row_dict[feature]) for feature in tf.hopsworks_udf.transformation_features @@ -995,10 +1090,16 @@ def per_serving_key_features(self) -> Dict[str, set[str]]: return self._per_serving_key_features @property - def transformation_functions( + def model_dependent_transformation_functions( + self, + ) -> Optional[List[transformation_function.TransformationFunction]]: + return self._model_dependent_transformation_functions + + @property + def on_demand_transformation_functions( self, - ) -> Optional[List[transformation_functions.TransformationFunction]]: - return self._transformation_functions + ) -> Optional[List[transformation_function.TransformationFunction]]: + return self._on_demand_transformation_functions @property def return_feature_value_handlers(self) -> Dict[str, Callable]: @@ -1070,7 +1171,9 @@ def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] output_column_names = [] - for transformation_function in self._transformation_functions: + for ( + transformation_function + ) in self._model_dependent_transformation_functions: transformation_features += ( transformation_function.hopsworks_udf.transformation_features ) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index cc50428632..fea3dd0301 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -804,6 +804,9 @@ def parse_schema_feature_group( self, dataframe: Union[pd.DataFrame, pl.DataFrame], time_travel_format: Optional[str] = None, + transformation_functions: Optional[ + List[transformation_function.TransformationFunction] + ] = None, ) -> List[feature.Feature]: if isinstance(dataframe, pd.DataFrame): arrow_schema = pa.Schema.from_pandas(dataframe, preserve_index=False) @@ -812,6 +815,19 @@ def parse_schema_feature_group( ): arrow_schema = dataframe.to_arrow().schema features = [] + transformed_features = [] + dropped_features = [] + + if transformation_functions: + for tf in transformation_functions: + transformed_features.append( + feature.Feature( + tf.hopsworks_udf.output_column_names[0], + tf.hopsworks_udf.return_types[0], + on_demand=True, + ) + ) + dropped_features.extend(tf.hopsworks_udf.dropped_features) for feat_name in arrow_schema.names: name = util.autofix_feature_name(feat_name) try: @@ -820,8 +836,10 @@ def parse_schema_feature_group( ) except ValueError as e: raise FeatureStoreException(f"Feature '{name}': {str(e)}") from e - features.append(feature.Feature(name, converted_type)) - return features + if name not in dropped_features: + features.append(feature.Feature(name, converted_type)) + + return features + transformed_features def parse_schema_training_dataset( self, dataframe: Union[pd.DataFrame, pl.DataFrame] @@ -842,6 +860,11 @@ def save_dataframe( online_write_options: Dict[str, Any], validation_id: Optional[int] = None, ) -> Optional[job.Job]: + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( 
+ feature_group.transformation_functions, dataframe + ) + if ( isinstance(feature_group, ExternalFeatureGroup) and feature_group.online_enabled @@ -1319,7 +1342,7 @@ def _apply_transformation_function( # Raises `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. """ - transformed_features = set() + dropped_features = set() if isinstance(dataset, pl.DataFrame) or isinstance( dataset, pl.dataframe.frame.DataFrame @@ -1342,7 +1365,7 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - transformed_features.update(tf.hopsworks_udf.transformation_features) + dropped_features.update(tf.hopsworks_udf.dropped_features) dataset = pd.concat( [ dataset, @@ -1357,7 +1380,7 @@ def _apply_transformation_function( ], axis=1, ) - dataset = dataset.drop(transformed_features, axis=1) + dataset = dataset.drop(dropped_features, axis=1) return dataset @staticmethod @@ -1536,8 +1559,11 @@ def acked(err: Exception, msg: Any) -> None: elif not isinstance( feature_group, ExternalFeatureGroup ) and self._start_offline_materialization(offline_write_options): - if (not offline_write_options.get("skip_offsets", False) - and self._job_api.last_execution(feature_group.materialization_job)): # always skip offsets if executing job for the first time + if not offline_write_options.get( + "skip_offsets", False + ) and self._job_api.last_execution( + feature_group.materialization_job + ): # always skip offsets if executing job for the first time # don't provide the current offsets (read from where the job last left off) initial_check_point = "" # provide the initial_check_point as it will reduce the read amplification of materialization job diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index a22be38cc0..60f5f14854 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -351,6 +351,10 @@ def save_dataframe( validation_id=None, ): try: + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) if ( isinstance(feature_group, fg_mod.ExternalFeatureGroup) and feature_group.online_enabled @@ -395,6 +399,11 @@ def save_stream_dataframe( checkpoint_dir, write_options, ): + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) + write_options = self._get_kafka_config( feature_group.feature_store_id, write_options ) @@ -1115,8 +1124,29 @@ def read_options(self, data_format, provided_options): options.update(provided_options) return options - def parse_schema_feature_group(self, dataframe, time_travel_format=None): + def parse_schema_feature_group( + self, + dataframe, + time_travel_format=None, + transformation_functions: Optional[ + List[transformation_function.TransformationFunction] + ] = None, + ): features = [] + transformed_features = [] + dropped_features = [] + + if transformation_functions: + for tf in transformation_functions: + transformed_features.append( + feature.Feature( + tf.hopsworks_udf.output_column_names[0], + tf.hopsworks_udf.return_types[0], + on_demand=True, + ) + ) + dropped_features.extend(tf.hopsworks_udf.dropped_features) + using_hudi = time_travel_format == "HUDI" for feat in dataframe.schema: name = 
util.autofix_feature_name(feat.name) @@ -1126,12 +1156,13 @@ def parse_schema_feature_group(self, dataframe, time_travel_format=None): ) except ValueError as e: raise FeatureStoreException(f"Feature '{feat.name}': {str(e)}") from e - features.append( - feature.Feature( - name, converted_type, feat.metadata.get("description", None) + if name not in dropped_features: + features.append( + feature.Feature( + name, converted_type, feat.metadata.get("description", None) + ) ) - ) - return features + return features + transformed_features def parse_schema_training_dataset(self, dataframe): return [ @@ -1244,7 +1275,7 @@ def _apply_transformation_function( # Raises `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. """ - transformed_features = set() + dropped_features = set() transformations = [] transformation_features = [] output_col_names = [] @@ -1260,7 +1291,7 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - transformed_features.update(tf.hopsworks_udf.transformation_features) + dropped_features.update(tf.hopsworks_udf.dropped_features) pandas_udf = hopsworks_udf.get_udf() output_col_name = hopsworks_udf.output_column_names[0] @@ -1276,7 +1307,7 @@ def _apply_transformation_function( untransformed_columns = [] # Untransformed column maintained as a list since order is imported while selecting features. for column in dataset.columns: - if column not in transformed_features: + if column not in dropped_features: untransformed_columns.append(column) # Applying transformations transformed_dataset = dataset.select( diff --git a/python/hsfs/feature.py b/python/hsfs/feature.py index 89f19b060d..412929a75e 100644 --- a/python/hsfs/feature.py +++ b/python/hsfs/feature.py @@ -53,6 +53,7 @@ def __init__( "hsfs.feature_group.SpineGroup", ] ] = None, + on_demand: bool = False, **kwargs, ) -> None: self._name = util.autofix_feature_name(name) @@ -67,6 +68,7 @@ def __init__( self._feature_group_id = feature_group.id else: self._feature_group_id = feature_group_id + self._on_demand = on_demand def to_dict(self) -> Dict[str, Any]: """Get structured info about specific Feature in python dictionary format. 
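To make the insert-time behaviour added above concrete: both engines now compute each on-demand feature from its input columns, append it under the UDF's name, and remove any inputs the UDF listed in `drop=[...]`. Here is a minimal, self-contained sketch of that mechanic (an illustration, not the library code; `plus_one` and the column names are hypothetical):

```python
import pandas as pd

def apply_on_demand(df, func, input_features, output_name, dropped_features):
    # Compute the on-demand column from its input features...
    df[output_name] = func(*[df[f] for f in input_features])
    # ...then drop the inputs that the UDF declared with drop=[...].
    return df.drop(columns=list(dropped_features))

df = pd.DataFrame({"col1": [1, 2], "col2": [10, 20]})
result = apply_on_demand(df, lambda s: s + 1, ["col1"], "plus_one", ["col1"])
print(list(result.columns))  # ['col2', 'plus_one'] -- 'col1' was dropped
```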
@@ -93,6 +95,7 @@ def to_dict(self) -> Dict[str, Any]: "onlineType": self._online_type, "defaultValue": self._default_value, "featureGroupId": self._feature_group_id, + "onDemand": self.on_demand, } def json(self) -> str: @@ -206,6 +209,15 @@ def default_value(self, default_value: Optional[str]) -> None: def feature_group_id(self) -> Optional[int]: return self._feature_group_id + @property + def on_demand(self) -> bool: + """Whether the feature is a on-demand feature computed using on-demand transformation functions""" + return self._on_demand + + @on_demand.setter + def on_demand(self, on_demand) -> None: + self._on_demand = on_demand + def __lt__(self, other: Any) -> "filter.Filter": return filter.Filter(self, filter.Filter.LT, other) diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index de5577417c..0bbeb26552 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -73,8 +73,10 @@ from hsfs.embedding import EmbeddingIndex from hsfs.expectation_suite import ExpectationSuite from hsfs.ge_validation_result import ValidationResult +from hsfs.hopsworks_udf import HopsworksUdf, UDFType from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig +from hsfs.transformation_function import TransformationFunction from hsfs.validation_report import ValidationReport @@ -543,8 +545,13 @@ def get_storage_connector(self): """ storage_connector_provenance = self.get_storage_connector_provenance() - if storage_connector_provenance.inaccessible or storage_connector_provenance.deleted: - _logger.info("The parent storage connector is deleted or inaccessible. For more details access `get_storage_connector_provenance`") + if ( + storage_connector_provenance.inaccessible + or storage_connector_provenance.deleted + ): + _logger.info( + "The parent storage connector is deleted or inaccessible. For more details access `get_storage_connector_provenance`" + ) if storage_connector_provenance.accessible: return storage_connector_provenance.accessible[0] @@ -2022,6 +2029,9 @@ def __init__( Union[Dict[str, Any], "deltastreamer_jobconf.DeltaStreamerJobConf"] ] = None, deprecated: bool = False, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, **kwargs, ) -> None: super().__init__( @@ -2124,6 +2134,44 @@ def __init__( self._feature_writers: Optional[Dict[str, callable]] = None self._writer: Optional[callable] = None + # On-Demand Transformation Functions + self._transformation_functions: List[TransformationFunction] = ( + [ + TransformationFunction( + featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + if not isinstance(transformation_function, TransformationFunction) + else transformation_function + for transformation_function in transformation_functions + ] + if transformation_functions + else [] + ) + + if self._transformation_functions: + self._transformation_functions = ( + FeatureGroup._sort_transformation_functions( + self._transformation_functions + ) + ) + + @staticmethod + def _sort_transformation_functions( + transformation_functions: List[TransformationFunction], + ) -> List[TransformationFunction]: + """ + Function that sorts transformation functions in the order of the output column names. + The list of transformation functions are sorted based on the output columns names to maintain consistent ordering. + # Arguments + transformation_functions: `List[TransformationFunction]`. 
List of transformation functions to be sorted + # Returns + `List[TransformationFunction]`: List of transformation functions to be sorted + """ + return sorted(transformation_functions, key=lambda x: x.output_column_names[0]) + def read( self, wallclock_time: Optional[Union[str, int, datetime, date]] = None, @@ -3204,6 +3252,17 @@ def from_response_json( json_decamelized["embedding_index"] = EmbeddingIndex.from_response_json( json_decamelized["embedding_index"] ) + if "transformation_functions" in json_decamelized: + transformation_functions = json_decamelized["transformation_functions"] + json_decamelized["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] return cls(**json_decamelized) for fg in json_decamelized: if "type" in fg: @@ -3214,6 +3273,17 @@ def from_response_json( fg["embedding_index"] = EmbeddingIndex.from_response_json( fg["embedding_index"] ) + if "transformation_functions" in fg: + transformation_functions = fg["transformation_functions"] + fg["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] return [cls(**fg) for fg in json_decamelized] def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureGroup": @@ -3224,6 +3294,17 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureGroup" json_decamelized["embedding_index"] = EmbeddingIndex.from_response_json( json_decamelized["embedding_index"] ) + if "transformation_functions" in json_decamelized: + transformation_functions = json_decamelized["transformation_functions"] + json_decamelized["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] self.__init__(**json_decamelized) return self @@ -3270,6 +3351,7 @@ def to_dict(self) -> Dict[str, Any]: "topicName": self.topic_name, "notificationTopicName": self.notification_topic_name, "deprecated": self.deprecated, + "transformationFunctions": self._transformation_functions, } if self.embedding_index: fg_meta_dict["embeddingIndex"] = self.embedding_index.to_dict() @@ -3376,6 +3458,13 @@ def statistics(self) -> "Statistics": ) return super().statistics + @property + def transformation_functions( + self, + ) -> List[TransformationFunction]: + """Get transformation functions.""" + return self._transformation_functions + @description.setter def description(self, new_description: Optional[str]) -> None: self._description = new_description @@ -3402,6 +3491,13 @@ def stream(self, stream: bool) -> None: def parents(self, new_parents: "explicit_provenance.Links") -> None: self._parents = new_parents + @transformation_functions.setter + def transformation_functions( + self, + transformation_functions: List[TransformationFunction], + ) -> None: + self._transformation_functions = transformation_functions + @typechecked class ExternalFeatureGroup(FeatureGroupBase): diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 11eeac1983..4da096d80c 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -510,6 +510,9 @@ def create_feature_group( parents: Optional[List[feature_group.FeatureGroup]] = None, 
topic_name: Optional[str] = None, notification_topic_name: Optional[str] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> "feature_group.FeatureGroup": """Create a feature group metadata object. @@ -592,6 +595,7 @@ def create_feature_group( defaults to using project topic. notification_topic_name: Optionally, define the name of the topic used for sending notifications when entries are inserted or updated on the online feature store. If left undefined no notifications are sent. + transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations. # Returns `FeatureGroup`. The feature group metadata object. @@ -616,6 +620,7 @@ def create_feature_group( parents=parents or [], topic_name=topic_name, notification_topic_name=notification_topic_name, + transformation_functions=transformation_functions, ) feature_group_object.feature_store = self return feature_group_object @@ -642,6 +647,9 @@ def get_or_create_feature_group( parents: Optional[List[feature_group.FeatureGroup]] = None, topic_name: Optional[str] = None, notification_topic_name: Optional[str] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> Union[ "feature_group.FeatureGroup", "feature_group.ExternalFeatureGroup", @@ -726,6 +734,7 @@ def get_or_create_feature_group( defaults to using project topic. notification_topic_name: Optionally, define the name of the topic used for sending notifications when entries are inserted or updated on the online feature store. If left undefined no notifications are sent. + transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations. # Returns `FeatureGroup`. The feature group metadata object. @@ -759,6 +768,7 @@ def get_or_create_feature_group( parents=parents or [], topic_name=topic_name, notification_topic_name=notification_topic_name, + transformation_functions=transformation_functions, ) feature_group_object.feature_store = self return feature_group_object diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 9ca317a473..f2f5019160 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -54,7 +54,7 @@ from hsfs.core.vector_db_client import VectorDbClient from hsfs.decorators import typechecked from hsfs.feature import Feature -from hsfs.hopsworks_udf import HopsworksUdf +from hsfs.hopsworks_udf import HopsworksUdf, UDFType from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig from hsfs.training_dataset_split import TrainingDatasetSplit @@ -126,6 +126,7 @@ def __init__( self.featurestore_id, hopsworks_udf=transformation_function, version=1, + transformation_type=UDFType.MODEL_DEPENDENT, ) if not isinstance(transformation_function, TransformationFunction) else transformation_function @@ -493,6 +494,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[Dict[str, Any]] = None, ) -> Union[List[Any], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vector from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -566,6 +568,7 @@ def get_feature_vector( force_sql_client: boolean, defaults to False. If set to True, reads from online feature store using the SQL client if initialised. 
allow_missing: Setting to `True` returns feature vectors with missing values. + request_parameters: Request parameters required by on-demand transformation functions. # Returns `list`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list"`, `"pandas"`, `"polars"` or `"numpy"` @@ -591,6 +594,7 @@ def get_feature_vector( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + request_parameters=request_parameters, ) def get_feature_vectors( @@ -602,6 +606,7 @@ def get_feature_vectors( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[List[Dict[str, Any]]] = None, ) -> Union[List[List[Any]], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vectors in batches from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -700,6 +705,7 @@ def get_feature_vectors( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + request_parameters=request_parameters, ) def get_inference_helper( @@ -853,6 +859,10 @@ def find_neighbors( the number of results returned may be less than k. Try using a large value of k and extract the top k items from the results if needed. + !!! warning "Duplicate column error in Polars" + If the feature view has duplicate column names, attempting to create a polars DataFrame + will raise an error. To avoid this, set `return_type` to `"list"` or `"pandas"`. + # Arguments embedding: The target embedding for which neighbors are to be found. feature: The feature used to compute similarity score. Required only if there @@ -1024,7 +1034,7 @@ def get_batch_data( start_time, end_time, self._batch_scoring_server.training_dataset_version, - self._batch_scoring_server._transformation_functions, + self._batch_scoring_server._model_dependent_transformation_functions, read_options, spine, primary_keys, @@ -3442,7 +3452,12 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": featurestore_name=json_decamelized.get("featurestore_name", None), serving_keys=serving_keys, transformation_functions=[ - TransformationFunction.from_response_json(transformation_function) + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.MODEL_DEPENDENT, + } + ) for transformation_function in transformation_functions ] if transformation_functions diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b9f8bde5bb..0a005134a6 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -21,6 +21,7 @@ import warnings from dataclasses import dataclass from datetime import date, datetime, time +from enum import Enum from typing import Any, Callable, Dict, List, Optional, Tuple, Union import humps @@ -31,7 +32,14 @@ from hsfs.transformation_statistics import TransformationStatistics -def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": +class UDFType(Enum): + MODEL_DEPENDENT = "model_dependent" + ON_DEMAND = "on_demand" + + +def udf( + return_type: Union[List[type], type], drop: Optional[Union[str, List[str]]] = None +) -> "HopsworksUdf": """ Create an User Defined Function that can be and used within the Hopsworks Feature Store. 
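Taken together with the `drop` argument introduced here, a feature-group-level on-demand transformation would be declared roughly as follows. This is a hedged usage sketch based on this patch: the `log_amount` function, the feature names, and the `fs` feature store handle are assumed for illustration.

```python
import numpy as np
import pandas as pd
from hopsworks import udf

# On-demand transformation: computes log1p(amount) and drops the raw
# "amount" column from the feature group schema.
@udf(float, drop=["amount"])
def log_amount(amount: pd.Series) -> pd.Series:
    return np.log1p(amount)

# `fs` is assumed to be a feature store handle obtained elsewhere.
fg = fs.create_feature_group(
    name="transactions",  # illustrative name
    version=1,
    primary_key=["tid"],
    transformation_functions=[log_amount("amount")],
)
```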
@@ -46,13 +54,14 @@ def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": ```python from hopsworks import udf - @udf(float) + @udf(float) def add_one(data1 : pd.Series): return data1 + 1 ``` # Arguments return_type: `list`. The output types of the defined UDF + drop: `List[str]`. The features to be dropped after application of transformation functions # Returns `HopsworksUdf`: The metadata object for hopsworks UDF's. @@ -62,7 +71,7 @@ def add_one(data1 : pd.Series): """ def wrapper(func: Callable) -> HopsworksUdf: - udf = HopsworksUdf(func=func, return_types=return_type) + udf = HopsworksUdf(func=func, return_types=return_type, dropped_features=drop) return udf return wrapper @@ -127,11 +136,17 @@ def __init__( return_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, + dropped_features: Optional[List[str]] = None, + feature_name_prefix: Optional[str] = None, ): self._return_types: List[str] = HopsworksUdf._validate_and_convert_output_types( return_types ) + self._feature_name_prefix: Optional[str] = ( + feature_name_prefix # Prefix to be added to feature names + ) + self._function_name: str = func.__name__ if name is None else name self._function_source: str = ( @@ -152,9 +167,55 @@ def __init__( HopsworksUdf._format_source_code(self._function_source) ) + self._dropped_features: List[str] = ( + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, self.transformation_features, feature_name_prefix + ) + ) + self._statistics: Optional[TransformationStatistics] = None - self._output_column_names: List[str] = self._get_output_column_names() + self._udf_type: UDFType = None + + self._output_column_names: List[str] = [] + + @staticmethod + def _validate_and_convert_drop_features( + dropped_features: Union[str, List[str]], + transformation_feature: List[str], + feature_name_prefix: str, + ) -> List[str]: + """ + Function that converts dropped features to a list and validates if the dropped feature is present in the transformation function + # Arguments + dropped_features: `Union[str, List[str]]`. Features of be dropped. + transformation_feature: `List[str]`. Features to be transformed in the UDF + # Returns + `List[str]`: A list of features to be dropped. + """ + if not dropped_features: + return [] + + dropped_features = ( + [dropped_features] + if not isinstance(dropped_features, list) + else dropped_features + ) + + feature_name_prefix = feature_name_prefix if feature_name_prefix else "" + + missing_drop_features = [] + for dropped_feature in dropped_features: + if feature_name_prefix + dropped_feature not in transformation_feature: + missing_drop_features.append(dropped_feature) + + if missing_drop_features: + missing_drop_features = "', '".join(missing_drop_features) + raise FeatureStoreException( + f"Cannot drop features '{missing_drop_features}' as they are not features given as arguments in the defined UDF." 
+ ) + + return dropped_features @staticmethod def _validate_and_convert_output_types( @@ -365,13 +426,18 @@ def _get_output_column_names(self) -> str: # Returns `List[str]`: List of feature names for the transformed columns """ - _BASE_COLUMN_NAME = ( - f'{self.function_name}_{"_".join(self.transformation_features)}_' - ) - if len(self.return_types) > 1: - return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types))] - else: - return [f"{_BASE_COLUMN_NAME}"] + if self._udf_type == UDFType.MODEL_DEPENDENT: + _BASE_COLUMN_NAME = ( + f'{self.function_name}_{"-".join(self.transformation_features)}_' + ) + if len(self.return_types) > 1: + return [ + f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types)) + ] + else: + return [f"{_BASE_COLUMN_NAME}"] + elif self._udf_type == UDFType.ON_DEMAND: + return [self.function_name] def _create_pandas_udf_return_schema_from_list(self) -> str: """ @@ -479,6 +545,13 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) + transformation_feature_name = self.transformation_features + index_dropped_features = [ + transformation_feature_name.index(dropped_feature) + for dropped_feature in self.dropped_features + ] + updated_dropped_features = [features[index] for index in index_dropped_features] + # Create a copy of the UDF to associate it with new feature names. udf = copy.deepcopy(self) @@ -491,6 +564,7 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": ) ] udf.output_column_names = udf._get_output_column_names() + udf.dropped_features = updated_dropped_features return udf def update_return_type_one_hot(self): @@ -541,10 +615,12 @@ def to_dict(self) -> Dict[str, Any]: "sourceCode": self._function_source, "outputTypes": self.return_types, "transformationFeatures": self.transformation_features, + "droppedFeatures": self.dropped_features, "statisticsArgumentNames": self._statistics_argument_names if self.statistics_required else None, "name": self._function_name, + "featureNamePrefix": self._feature_name_prefix, } def json(self) -> str: @@ -572,12 +648,17 @@ def from_response_json( json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] + feature_name_prefix = json_decamelized.get("feature_name_prefix", None) output_types = [ output_type.strip() for output_type in json_decamelized["output_types"] ] transformation_features = [ feature.strip() for feature in json_decamelized["transformation_features"] ] + dropped_features = [ + dropped_feature.strip() + for dropped_feature in json_decamelized["dropped_features"] + ] statistics_features = ( [ feature.strip() @@ -590,10 +671,6 @@ def from_response_json( # Reconstructing statistics arguments. arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) - transformation_features = ( - arg_list if not transformation_features else transformation_features - ) - if statistics_features: transformation_features = [ TransformationFeature( @@ -615,11 +692,28 @@ def from_response_json( return_types=output_types, name=function_name, transformation_features=transformation_features, + dropped_features=dropped_features, + feature_name_prefix=feature_name_prefix, ) # Set transformation features if already set. 
return hopsworks_udf + def _validate_udf_type(self): + if self.udf_type is None: + raise FeatureStoreException("UDF Type cannot be None") + + if self._udf_type == UDFType.ON_DEMAND: + if len(self.return_types) > 1: + raise FeatureStoreException( + "On-Demand Transformation functions can only return one column as output" + ) + + if self.statistics_required: + raise FeatureStoreException( + "On-Demand Transformation functions cannot use statistics, please remove statistics parameters from the functions" + ) + @property def return_types(self) -> List[str]: """Get the output types of the UDF""" @@ -648,17 +742,30 @@ def transformation_statistics( @property def output_column_names(self) -> List[str]: """Output columns names of the transformation function""" - return self._output_column_names + if self._feature_name_prefix: + return [ + self._feature_name_prefix + output_col_name + for output_col_name in self._output_column_names + ] + else: + return self._output_column_names @property def transformation_features(self) -> List[str]: """ List of feature names to be used in the User Defined Function. """ - return [ - transformation_feature.feature_name - for transformation_feature in self._transformation_features - ] + if self._feature_name_prefix: + return [ + self._feature_name_prefix + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] + + else: + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] @property def statistics_features(self) -> List[str]: @@ -692,6 +799,33 @@ def _statistics_argument_names(self) -> List[str]: if transformation_feature.statistic_argument_name is not None ] + @property + def udf_type(self) -> UDFType: + """Type of the UDF : Can be \"model dependent\" or \"on-demand\" """ + return self._udf_type + + @udf_type.setter + def udf_type(self, udf_type: UDFType) -> None: + self._udf_type = udf_type + self._validate_udf_type() + self._output_column_names = self._get_output_column_names() + + @property + def dropped_features(self) -> List[str]: + if self._feature_name_prefix: + return [ + self._feature_name_prefix + dropped_feature + for dropped_feature in self._dropped_features + ] + else: + return self._dropped_features + + @dropped_features.setter + def dropped_features(self, features: List[str]) -> None: + self._dropped_features = HopsworksUdf._validate_and_convert_drop_features( + features, self.transformation_features, self._feature_name_prefix + ) + @transformation_statistics.setter def transformation_statistics( self, statistics: List[FeatureDescriptiveStatistics] @@ -713,3 +847,6 @@ def output_column_names(self, output_col_names: Union[str, List[str]]) -> None: ) else: self._output_column_names = output_col_names + + def __repr__(self): + return f'{self.function_name}({", ".join(self.transformation_features)})' diff --git a/python/hsfs/training_dataset_feature.py b/python/hsfs/training_dataset_feature.py index a06637abe2..3aa3f6a81f 100644 --- a/python/hsfs/training_dataset_feature.py +++ b/python/hsfs/training_dataset_feature.py @@ -15,10 +15,14 @@ # from __future__ import annotations +from typing import Optional + import humps from hsfs import feature as feature_mod from hsfs import feature_group as feature_group_mod from hsfs import util +from hsfs.hopsworks_udf import UDFType +from hsfs.transformation_function import TransformationFunction class TrainingDatasetFeature: @@ -32,6 +36,7 @@ def __init__( label=False, inference_helper_column=False, 
training_helper_column=False, + transformation_function: Optional[TransformationFunction] = None, **kwargs, ): self._name = util.autofix_feature_name(name) @@ -47,6 +52,10 @@ def __init__( self._inference_helper_column = inference_helper_column self._training_helper_column = training_helper_column + self._on_demand_transformation_function: Optional[TransformationFunction] = ( + transformation_function if transformation_function else None + ) + def to_dict(self): return { "name": self._name, @@ -57,11 +66,21 @@ def to_dict(self): "trainingHelperColumn": self._training_helper_column, "featureGroupFeatureName": self._feature_group_feature_name, "featuregroup": self._feature_group, + "transformation_function": self._on_demand_transformation_function, } @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) + if json_decamelized.get("transformation_function", False): + json_decamelized["transformation_function"]["transformation_type"] = ( + UDFType.ON_DEMAND + ) + json_decamelized["transformation_function"] = ( + TransformationFunction.from_response_json( + json_decamelized.get("transformation_function") + ) + ) return cls(**json_decamelized) def is_complex(self): @@ -110,6 +129,11 @@ def inference_helper_column(self): def inference_helper_column(self, inference_helper_column): self._inference_helper_column = inference_helper_column + @property + def on_demand_transformation_function(self) -> TransformationFunction: + """Whether the feature is a on-demand feature computed using on-demand transformation functions""" + return self._on_demand_transformation_function + @property def training_helper_column(self): """Indicator if it is feature.""" @@ -128,4 +152,4 @@ def feature_group_feature_name(self): return self._feature_group_feature_name def __repr__(self): - return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._feature_group_feature_name}, {self._feature_group.id!r})" + return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._feature_group_feature_name}, {self._feature_group.id!r}, {self.on_demand_transformation_function})" diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index a3f6a295d7..65535aa539 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -23,7 +23,7 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core import transformation_function_engine from hsfs.decorators import typechecked -from hsfs.hopsworks_udf import HopsworksUdf +from hsfs.hopsworks_udf import HopsworksUdf, UDFType @typechecked @@ -44,6 +44,7 @@ def __init__( hopsworks_udf: HopsworksUdf, version: Optional[int] = None, id: Optional[int] = None, + transformation_type: Optional[UDFType] = None, type=None, items=None, count=None, @@ -65,6 +66,7 @@ def __init__( ) self._hopsworks_udf: HopsworksUdf = hopsworks_udf + self._hopsworks_udf.udf_type = transformation_type def save(self) -> None: """Save a transformation function into the backend. 
@@ -233,3 +235,11 @@ def hopsworks_udf(self) -> HopsworksUdf: def output_column_names(self) -> List[str]: """Output column names of transformation functions""" return self._hopsworks_udf._output_column_names + + def __repr__(self): + if self.hopsworks_udf._udf_type == UDFType.MODEL_DEPENDENT: + return ( + f"Model-Dependent Transformation Function : {repr(self.hopsworks_udf)}" + ) + else: + return f"On-Demand Transformation Function : {repr(self.hopsworks_udf)}" diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 8494d018f1..6595207ed3 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -19,7 +19,7 @@ import pandas as pd import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, udf +from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, UDFType, udf class TestHopsworksUdf: @@ -330,14 +330,21 @@ def test_generate_output_column_names_one_argument_one_output_type(self): def test_func(col1): return col1 + 1 + test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == ["test_func_col1_"] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + def test_generate_output_column_names_multiple_argument_one_output_type(self): @udf(int) def test_func(col1, col2, col3): return col1 + 1 - assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"] + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] def test_generate_output_column_names_single_argument_multiple_output_type(self): @udf([int, float, int]) @@ -346,6 +353,7 @@ def test_func(col1): {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} ) + test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == [ "test_func_col1_0", "test_func_col1_1", @@ -359,10 +367,11 @@ def test_func(col1, col2, col3): {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} ) + test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == [ - "test_func_col1_col2_col3_0", - "test_func_col1_col2_col3_1", - "test_func_col1_col2_col3_2", + "test_func_col1-col2-col3_0", + "test_func_col1-col2-col3_1", + "test_func_col1-col2-col3_2", ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @@ -388,30 +397,45 @@ def test_func(col1): } ) + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert ( test_func._create_pandas_udf_return_schema_from_list() == "`test_func_col1_0` bigint, `test_func_col1_1` double, `test_func_col1_2` string, `test_func_col1_3` date, `test_func_col1_4` timestamp, `test_func_col1_5` timestamp, `test_func_col1_6` boolean" ) def test_hopsworks_wrapper_single_output(self): + test_dataframe = pd.DataFrame({"col1": [1, 2, 3, 4]}) + @udf(int) def test_func(col1): return col1 + 1 - renaming_wrapper_function = test_func.hopsworksUdf_wrapper() + test_func.udf_type = UDFType.MODEL_DEPENDENT - test_dataframe = pd.DataFrame({"col1": [1, 2, 3, 4]}) + renaming_wrapper_function = test_func.hopsworksUdf_wrapper() result = renaming_wrapper_function(test_dataframe["col1"]) assert result.name == "test_func_col1_" assert result.values.tolist() == [2, 3, 4, 5] + test_func.udf_type = UDFType.ON_DEMAND + + renaming_wrapper_function = 
test_func.hopsworksUdf_wrapper() + + result = renaming_wrapper_function(test_dataframe["col1"]) + + assert result.name == "test_func" + assert result.values.tolist() == [2, 3, 4, 5] + def test_hopsworks_wrapper_multiple_output(self): @udf([int, float]) def test_func(col1, col2): return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) + test_func.udf_type = UDFType.MODEL_DEPENDENT + renaming_wrapper_function = test_func.hopsworksUdf_wrapper() test_dataframe = pd.DataFrame( @@ -422,7 +446,7 @@ def test_func(col1, col2): test_dataframe["column1"], test_dataframe["column2"] ) - assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"]) + assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"]) assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): From e87331eadea4535a0c7bf6715cded130b9aff4a2 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 3 Jul 2024 23:28:20 +0200 Subject: [PATCH 54/58] fixing unit tests --- python/hsfs/core/training_dataset_engine.py | 19 +---- python/hsfs/engine/python.py | 2 +- python/hsfs/hopsworks_udf.py | 24 ++++-- python/hsfs/training_dataset.py | 19 ----- python/tests/core/test_feature_view_engine.py | 76 ----------------- .../core/test_training_dataset_engine.py | 72 +---------------- .../test_transformation_function_engine.py | 12 ++- python/tests/engine/test_python.py | 10 ++- ...t_python_spark_transformation_functions.py | 79 ++++++++++++------ python/tests/engine/test_spark.py | 19 +++-- .../tests/fixtures/feature_view_fixtures.json | 6 +- .../fixtures/training_dataset_fixtures.json | 12 +-- .../transformation_function_fixtures.json | 21 +++-- python/tests/test_builtin_transformation.py | 81 +++++++++++++++++++ python/tests/test_hopswork_udf.py | 10 +-- python/tests/test_training_dataset.py | 2 - python/tests/test_transformation_function.py | 13 ++- 17 files changed, 221 insertions(+), 256 deletions(-) create mode 100644 python/tests/test_builtin_transformation.py diff --git a/python/hsfs/core/training_dataset_engine.py b/python/hsfs/core/training_dataset_engine.py index 8d47adf165..34907ce3ca 100644 --- a/python/hsfs/core/training_dataset_engine.py +++ b/python/hsfs/core/training_dataset_engine.py @@ -22,7 +22,6 @@ from hsfs.core import ( tags_api, training_dataset_api, - transformation_function_engine, ) @@ -38,11 +37,6 @@ def __init__(self, feature_store_id): feature_store_id ) self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - ) def save(self, training_dataset, features, user_write_options): if isinstance(features, query.Query): @@ -53,9 +47,6 @@ def save(self, training_dataset, features, user_write_options): ) for label_name in training_dataset.label ] - self._transformation_function_engine.attach_transformation_fn( - training_dataset - ) else: features = engine.get_instance().convert_to_default_dataframe(features) training_dataset._features = ( @@ -66,19 +57,11 @@ def save(self, training_dataset, features, user_write_options): if feature.name == label_name: feature.label = True - # check if user provided transformation functions and throw error as transformation functions work only - # with query objects - if training_dataset.transformation_functions: - raise ValueError( - "Transformation functions can only be applied to training datasets generated from Query object" - ) - if 
len(training_dataset.splits) > 0 and training_dataset.train_split is None: training_dataset.train_split = "train" warnings.warn( "Training dataset splits were defined but no `train_split` (the name of the split that is going to be " - "used for training) was provided. Setting this property to `train`. The statistics of this " - "split will be used for transformation functions.", + "used for training) was provided. Setting this property to `train`. ", stacklevel=1, ) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index fea3dd0301..9c2a4ca279 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -966,7 +966,7 @@ def get_training_data( # training_dataset_obj, feature_view_obj, training_dataset_version # ) return self._apply_transformation_function( - training_dataset_obj.transformation_functions, df + feature_view_obj.transformation_functions, df ) def split_labels( diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 0a005134a6..a17e432009 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -428,7 +428,7 @@ def _get_output_column_names(self) -> str: """ if self._udf_type == UDFType.MODEL_DEPENDENT: _BASE_COLUMN_NAME = ( - f'{self.function_name}_{"-".join(self.transformation_features)}_' + f'{self.function_name}_{"_".join(self.transformation_features)}_' ) if len(self.return_types) > 1: return [ @@ -655,10 +655,14 @@ def from_response_json( transformation_features = [ feature.strip() for feature in json_decamelized["transformation_features"] ] - dropped_features = [ - dropped_feature.strip() - for dropped_feature in json_decamelized["dropped_features"] - ] + dropped_features = ( + [ + dropped_feature.strip() + for dropped_feature in json_decamelized["dropped_features"] + ] + if "dropped_features" in json_decamelized + else None + ) statistics_features = ( [ feature.strip() @@ -671,6 +675,16 @@ def from_response_json( # Reconstructing statistics arguments. 
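# Backward-compatibility note on the guarded `dropped_features` lookup in
# `from_response_json` above: older backend responses may omit the
# "droppedFeatures" field entirely, so the deserializer falls back to None
# instead of raising a KeyError. A small sketch of the pattern, assuming a
# decamelized response dict:
#
#     json_decamelized = {
#         "name": "plus_one",
#         "output_types": ["bigint"],
#         "transformation_features": ["col1"],
#     }  # no "dropped_features" key in older responses
#
#     dropped_features = (
#         [f.strip() for f in json_decamelized["dropped_features"]]
#         if "dropped_features" in json_decamelized
#         else None
#     )
#     assert dropped_features is None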
arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) + transformation_features = ( + arg_list if not transformation_features else transformation_features + ) + + if dropped_features: + dropped_features = [ + transformation_features[arg_list.index(dropped_feature)] + for dropped_feature in dropped_features + ] + if statistics_features: transformation_features = [ TransformationFeature( diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 5f51044546..f19b95e037 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -29,7 +29,6 @@ statistics_engine, training_dataset_api, training_dataset_engine, - transformation_function_engine, vector_server, ) from hsfs.statistics_config import StatisticsConfig @@ -538,7 +537,6 @@ def __init__( from_query=None, querydto=None, label=None, - transformation_functions=None, train_split=None, time_split_size=None, extra_filter=None, @@ -580,7 +578,6 @@ def __init__( self._querydto = querydto self._feature_store_id = featurestore_id self._feature_store_name = featurestore_name - self._transformation_functions = transformation_functions self._training_dataset_api = training_dataset_api.TrainingDatasetApi( featurestore_id @@ -592,9 +589,6 @@ def __init__( featurestore_id, self.ENTITY_TYPE ) self._code_engine = code_engine.CodeEngine(featurestore_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine(featurestore_id) - ) self._vector_server = vector_server.VectorServer( featurestore_id, features=self._features ) @@ -1084,19 +1078,6 @@ def feature_store_name(self) -> str: """Name of the feature store in which the feature group is located.""" return self._feature_store_name - @property - def transformation_functions(self): - """Set transformation functions.""" - if self._id is not None and self._transformation_functions is None: - self._transformation_functions = ( - self._transformation_function_engine.get_td_transformation_fn(self) - ) - return self._transformation_functions - - @transformation_functions.setter - def transformation_functions(self, transformation_functions): - self._transformation_functions = transformation_functions - @property def serving_keys(self) -> Set[str]: """Set of primary key names that is used as keys in input dict object for `get_serving_vector` method.""" diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index f6a141fb20..f1c3f7ab3d 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -29,9 +29,7 @@ from hsfs.constructor.query import Query from hsfs.core import arrow_flight_client, feature_view_engine from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.hopsworks_udf import udf from hsfs.storage_connector import BigQueryConnector, StorageConnector -from hsfs.transformation_function import TransformationFunction engine.init("python") @@ -349,9 +347,6 @@ def test_get_name(self, mocker): feature_store_id = 99 mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" - ) fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id @@ -387,9 +382,6 @@ def test_get_name_version(self, mocker): feature_store_id = 99 mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - 
mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" - ) fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id @@ -555,74 +547,6 @@ def test_get_batch_query_string_pit_query(self, mocker): assert mock_fv_api.return_value.get_batch_query.call_count == 1 assert mock_qc_api.return_value.construct_query.call_count == 1 - def test_get_attached_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - - fv_engine = feature_view_engine.FeatureViewEngine( - feature_store_id=feature_store_id - ) - - @udf(int) - def test2(col1): - return col1 + 1 - - tf = TransformationFunction( - featurestore_id=10, - hopsworks_udf=test2, - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = [tf] - - # Act - result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) - - # Assert - assert result == [tf] - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 - - def test_get_attached_transformation_fn_multiple(self, mocker): - # Arrange - feature_store_id = 99 - - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - - fv_engine = feature_view_engine.FeatureViewEngine( - feature_store_id=feature_store_id - ) - - @udf(int) - def test1(col1): - return col1 + 1 - - tf1 = TransformationFunction( - featurestore_id=10, - hopsworks_udf=test1, - ) - - @udf(int) - def test2(col1): - return col1 + 2 - - tf2 = TransformationFunction( - featurestore_id=10, - hopsworks_udf=test2, - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = [ - tf1, - tf2, - ] - - # Act - result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) - - # Assert - assert result == [tf1, tf2] - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 - def test_create_training_dataset(self, mocker): # Arrange feature_store_id = 99 diff --git a/python/tests/core/test_training_dataset_engine.py b/python/tests/core/test_training_dataset_engine.py index fea3d43f88..c1a55ca00a 100644 --- a/python/tests/core/test_training_dataset_engine.py +++ b/python/tests/core/test_training_dataset_engine.py @@ -14,16 +14,13 @@ # limitations under the License. 
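# An aside on the dropped-feature remapping added to
# `HopsworksUdf.from_response_json` in this commit: dropped names are
# serialized as the UDF's *argument* names, so on deserialization each one is
# translated to the *feature* name bound at the same position. A standalone
# sketch of the idea, with hypothetical names:
#
#     arg_list = ["data1", "data2"]                # UDF parameter names
#     transformation_features = ["age", "salary"]  # features bound via __call__
#     dropped = ["data1"]
#
#     dropped_features = [
#         transformation_features[arg_list.index(d)] for d in dropped
#     ]
#     assert dropped_features == ["age"]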
# -import pytest from hsfs import ( feature_group, training_dataset, training_dataset_feature, - transformation_function, ) from hsfs.constructor import query from hsfs.core import training_dataset_engine -from hsfs.hopsworks_udf import udf class TestTrainingDatasetEngine: @@ -32,9 +29,6 @@ def test_save(self, mocker): feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") @@ -76,9 +70,6 @@ def test_save_query(self, mocker, backend_fixtures): mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") @@ -107,70 +98,12 @@ def test_save_query(self, mocker, backend_fixtures): assert td._features[0].label is True assert td._features[1].label is True - def test_save_transformation_functions(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") - - @udf(int) - def plus_one(a): - return a + 1 - - tf = transformation_function.TransformationFunction( - hopsworks_udf=plus_one, featurestore_id=99 - ) - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={}, - label=["f", "f_wrong"], - transformation_functions=tf, - ) - - td_engine = training_dataset_engine.TrainingDatasetEngine(feature_store_id) - - f = training_dataset_feature.TrainingDatasetFeature( - name="f", type="str", label=False - ) - f1 = training_dataset_feature.TrainingDatasetFeature( - name="f1", type="int", label=False - ) - - features = [f, f1] - - mock_engine_get_instance.return_value.parse_schema_training_dataset.return_value = features - - # Act - with pytest.raises(ValueError) as e_info: - td_engine.save(training_dataset=td, features=None, user_write_options=None) - - # Assert - assert mock_td_api.return_value.post.call_count == 0 - assert len(td._features) == 2 - assert td._features[0].label is True - assert td._features[1].label is False - assert ( - str(e_info.value) - == "Transformation functions can only be applied to training datasets generated from Query object" - ) - def test_save_splits(self, mocker): # Arrange feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") mock_warning = mocker.patch("warnings.warn") @@ -209,8 +142,7 @@ def test_save_splits(self, mocker): assert ( mock_warning.call_args[0][0] == "Training dataset splits were defined but no `train_split` (the name of the split that is going to be " - "used for training) was provided. Setting this property to `train`. The statistics of this " - "split will be used for transformation functions." + "used for training) was provided. Setting this property to `train`. 
" ) def test_insert(self, mocker): diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index 11cd593cc3..e56e820d87 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -24,7 +24,7 @@ transformation_function, ) from hsfs.core import transformation_function_engine -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf fg1 = feature_group.FeatureGroup( @@ -98,6 +98,7 @@ def testFunction(col1): tf = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction, + transformation_type=UDFType.MODEL_DEPENDENT, ) # Act @@ -125,6 +126,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) @udf(float) @@ -134,6 +136,7 @@ def testFunction2(data2, statistics_data2): tf2 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction2, + transformation_type=UDFType.MODEL_DEPENDENT, ) transformations = [tf1, tf2] @@ -166,6 +169,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) @udf(float) @@ -175,6 +179,7 @@ def testFunction2(data2, statistics_data2): tf2 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction2, + transformation_type=UDFType.MODEL_DEPENDENT, ) transformations = [tf1, tf2] @@ -207,6 +212,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) # Act @@ -266,6 +272,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( @@ -325,6 +332,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( @@ -383,6 +391,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( @@ -439,6 +448,7 @@ def testFunction1(col1, statistics=stats): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 07958686de..c1ac202fba 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -2214,6 +2214,7 @@ def test_get_training_data(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2230,7 +2231,7 @@ def test_get_training_data(self, mocker): # Act python_engine.get_training_data( training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, query_obj=mocker.Mock(), read_options=None, dataframe_type="default", @@ -2964,6 +2965,7 @@ def test_write_training_dataset(self, mocker): 
def test_write_training_dataset_query_td(self, mocker, backend_fixtures): # Arrange + mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.training_dataset_job_conf.TrainingDatasetJobConf") mock_job = mocker.patch("hsfs.core.job.Job") @@ -3008,6 +3010,7 @@ def test_write_training_dataset_query_td(self, mocker, backend_fixtures): def test_write_training_dataset_query_fv(self, mocker, backend_fixtures): # Arrange + mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.training_dataset_job_conf.TrainingDatasetJobConf") mock_job = mocker.patch("hsfs.core.job.Job") @@ -3280,7 +3283,7 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1"]) def plus_two(col1): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) @@ -3324,7 +3327,7 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1", "col2"]) def plus_two(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) @@ -3354,6 +3357,7 @@ def plus_two(col1, col2): ) # Assert + print(result.columns) assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]) assert len(result) == 2 assert result["plus_two_col1_col2_0"][0] == 2 diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index cb1a0652b5..71bb48cd05 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -31,7 +31,7 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.engine import python, spark -from hsfs.hopsworks_udf import HopsworksUdf, udf +from hsfs.hopsworks_udf import HopsworksUdf, UDFType, udf from pyspark.sql.types import ( BooleanType, DateType, @@ -161,6 +161,7 @@ def test_apply_builtin_minmax_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "min_max_scaler", + "droppedFeatures": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -169,7 +170,9 @@ def test_apply_builtin_minmax_from_backend(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -230,7 +233,9 @@ def test_apply_builtin_minmax(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=min_max_scaler("col_0"), featurestore_id=99 + hopsworks_udf=min_max_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -299,6 +304,7 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "standard_scaler", + "droppedFeatures": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -307,7 +313,9 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - 
hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] mean = statistics.mean([1, 2]) @@ -369,7 +377,9 @@ def test_apply_builtin_standard_scaler(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=standard_scaler("col_0"), featurestore_id=99 + hopsworks_udf=standard_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -441,6 +451,7 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "robust_scaler", + "droppedFeatures": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -449,7 +460,9 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] percentiles = [1] * 100 @@ -513,7 +526,9 @@ def test_apply_builtin_robust_scaler(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=robust_scaler("col_0"), featurestore_id=99 + hopsworks_udf=robust_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -571,7 +586,7 @@ def test_apply_plus_one_int(self, mocker): ) # Arrange - @udf(int) + @udf(int, drop=["col_0"]) def tf_fun(col_0): return col_0 + 1 @@ -579,7 +594,9 @@ def tf_fun(col_0): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -629,14 +646,16 @@ def test_apply_plus_one_str(self, mocker): ) # Arrange - @udf(str) + @udf(str, drop="col_0") def tf_fun(col_0): return col_0 + "1" td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -686,14 +705,16 @@ def test_apply_plus_one_double(self, mocker): spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - @udf(float) + @udf(float, drop="col_0") def tf_fun(col_0): return col_0 + 1.0 td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -758,7 +779,7 @@ def test_apply_plus_one_datetime_no_tz(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop="col_0") def tf_fun(col_0): import datetime @@ -767,7 +788,9 @@ def tf_fun(col_0): td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -833,7 +856,7 @@ def test_apply_plus_one_datetime_tz_utc(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop="col_0") def tf_fun(col_0) -> datetime.datetime: import datetime @@ -844,7 +867,9 @@ def tf_fun(col_0) -> datetime.datetime: td = self._create_training_dataset() 
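# The updates throughout this test module exercise the new `drop` parameter
# of the `@udf` decorator: it accepts a single feature name or a list of
# names and marks those inputs for removal from the output once the
# transformation has consumed them. A hedged usage sketch (column names are
# illustrative):
#
#     @udf(float, drop="col_0")
#     def tf_fun(col_0):
#         return col_0 + 1.0
#
#     @udf([int, int], drop=["col1", "col2"])
#     def plus_two(col1, col2):
#         return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2})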
transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -911,7 +936,7 @@ def test_apply_plus_one_datetime_tz_pst(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop="col_0") def tf_fun(col_0) -> datetime.datetime: import datetime @@ -923,7 +948,9 @@ def tf_fun(col_0) -> datetime.datetime: td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -989,7 +1016,7 @@ def test_apply_plus_one_datetime_ts_none(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop=["col_0"]) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -1003,7 +1030,9 @@ def tf_fun(col_0) -> datetime.datetime: td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -1063,7 +1092,7 @@ def test_apply_plus_one_date(self, mocker): ) # Arrange - @udf(datetime.date) + @udf(datetime.date, drop=["col_0"]) def tf_fun(col_0): import datetime @@ -1072,7 +1101,9 @@ def tf_fun(col_0): td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -1089,7 +1120,7 @@ def test_apply_plus_one_invalid_type(self, mocker): # Arrange with pytest.raises(FeatureStoreException) as e_info: - @udf(list) + @udf(list, drop="a") def tf_fun(a): return a + 1 diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 7eabd38d07..0de616084a 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -34,7 +34,7 @@ from hsfs.constructor import hudi_feature_group_alias, query from hsfs.core import training_dataset_engine from hsfs.engine import spark -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf from hsfs.training_dataset_feature import TrainingDatasetFeature from pyspark.sql import DataFrame from pyspark.sql.types import ( @@ -2675,6 +2675,7 @@ def plus_one(col1): tf = transformation_function.TransformationFunction( featurestore_id=99, hopsworks_udf=plus_one, + transformation_type=UDFType.MODEL_DEPENDENT, ) f = training_dataset_feature.TrainingDatasetFeature( @@ -2724,6 +2725,7 @@ def plus_one(col1): tf = transformation_function.TransformationFunction( featurestore_id=99, hopsworks_udf=plus_one, + transformation_type=UDFType.MODEL_DEPENDENT, ) transformation_fn_dict = dict() @@ -4328,13 +4330,12 @@ def test_apply_transformation_function_single_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @udf(int) + @udf(int, drop=["col1"]) def plus_one(col1): return col1 + 1 tf = transformation_function.TransformationFunction( - 99, - hopsworks_udf=plus_one, + 99, hopsworks_udf=plus_one, transformation_type=UDFType.MODEL_DEPENDENT ) f = feature.Feature(name="col_0", type=IntegerType(), index=0) @@ -4388,13 +4389,12 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "spark" spark_engine = 
spark.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1"]) def plus_two(col1): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) tf = transformation_function.TransformationFunction( - 99, - hopsworks_udf=plus_two, + 99, hopsworks_udf=plus_two, transformation_type=UDFType.MODEL_DEPENDENT ) f = feature.Feature(name="col_0", type=IntegerType(), index=0) @@ -4449,13 +4449,12 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1", "col2"]) def test(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) tf = transformation_function.TransformationFunction( - 99, - hopsworks_udf=test, + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT ) f = feature.Feature(name="col_0", type=IntegerType(), index=0) diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index 5e229955bd..1ad25dea36 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -934,7 +934,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } }, { @@ -945,7 +946,8 @@ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":["double"], - "transformationFeatures":["col1"] + "transformationFeatures":["col1"], + "dropped_features":["data1"] } } ], diff --git a/python/tests/fixtures/training_dataset_fixtures.json b/python/tests/fixtures/training_dataset_fixtures.json index ea3f356e68..6db5d08325 100644 --- a/python/tests/fixtures/training_dataset_fixtures.json +++ b/python/tests/fixtures/training_dataset_fixtures.json @@ -122,21 +122,12 @@ "items": [ { "featurestore_id": 11, - "transformation_fn": "test_transformation_fn", "version": 1, "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "test_output_type", - "id": 43, - "type": "transformationFunctionTDO", - "items": [], - "count": 0, "href": "test_href" } ] - }, - "transformation_function": "test_transformation_function" + } } ], "statistics_config": { @@ -153,7 +144,6 @@ "from_query": "test_from_query", "querydto": "test_querydto", "label": "test_label", - "transformation_functions": "test_transformation_functions", "train_split": "test_train_split", "time_split_size": "test_time_split_size", "type": "trainingDatasetDTO" diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 6fa5d762b7..036eb2fac7 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -8,7 +8,8 @@ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":["double"], - "transformationFeatures":["col1"] + "transformationFeatures":["col1"], + "dropped_features":["data1"] } } }, @@ -22,7 +23,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } } }, @@ -36,7 +38,8 @@ "name": "test_func", "outputTypes":["string"], 
"transformationFeatures":["feature1", "feature2", "feature3"], - "statisticsArgumentNames":["data1", "data2"] + "statisticsArgumentNames":["data1", "data2"], + "dropped_features":["data1", "data2", "data3"] } } }, @@ -50,7 +53,8 @@ "name": "test_func", "outputTypes":["string", "double"], "transformationFeatures":["feature1", "feature2", "feature3"], - "statisticsArgumentNames":["data1", "data2"] + "statisticsArgumentNames":["data1", "data2"], + "dropped_features":["data1", "data2", "data3"] } } }, @@ -67,7 +71,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } }, { @@ -78,7 +83,8 @@ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":["double"], - "transformationFeatures":["col1"] + "transformationFeatures":["col1"], + "dropped_features":["data1"] } } ] @@ -97,7 +103,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } } ] diff --git a/python/tests/test_builtin_transformation.py b/python/tests/test_builtin_transformation.py new file mode 100644 index 0000000000..4a8a01af9c --- /dev/null +++ b/python/tests/test_builtin_transformation.py @@ -0,0 +1,81 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import hsfs.engine as engine +import pandas as pd +from hsfs.builtin_transformations import ( + min_max_scaler, +) +from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.engine import python +from hsfs.hopsworks_udf import UDFType + + +class TestBuiltinTransformations: + @staticmethod + def validate_transformations_python( + transformed_outputs, expected_output, expected_col_names + ): + if isinstance(transformed_outputs, pd.Series): + assert transformed_outputs.name == expected_col_names + else: + assert all(transformed_outputs.columns == expected_col_names) + assert all(transformed_outputs.values == expected_output.values) + + def test_min_max_scaler(self): + test_dataframe = pd.DataFrame( + { + "col1": [1, 2, 3, 4], + "col2": [1.2, 3.4, 5.6, 9.1], + } + ) + statistics_df = test_dataframe.describe().to_dict() + + # Test case 1 : Integer column + min_max_scaler_col1 = min_max_scaler("col1") + min_max_scaler_col1.udf_type = UDFType.MODEL_DEPENDENT + + min_max_scaler_col1.transformation_statistics = [ + FeatureDescriptiveStatistics( + feature_name="col1", + min=statistics_df["col1"]["min"], + max=statistics_df["col1"]["max"], + ) + ] + + expected_df = (test_dataframe["col1"] - test_dataframe["col1"].min()) / ( + test_dataframe["col1"].max() - test_dataframe["col1"].min() + ) + + # Test with python engine + engine.set_instance(engine=python.Engine(), engine_type="python") + + transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) + TestBuiltinTransformations.validate_transformations_python( + transformed_outputs=transformed_df, + expected_output=expected_df, + expected_col_names="min_max_scaler_col1_", + ) + + # Test with spark engine + engine.set_instance(engine=python.Engine(), engine_type="python") + + transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) + TestBuiltinTransformations.validate_transformations_python( + transformed_outputs=transformed_df, + expected_output=expected_df, + expected_col_names="min_max_scaler_col1_", + ) diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 6595207ed3..fe9531b751 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -342,7 +342,7 @@ def test_func(col1, col2, col3): return col1 + 1 test_func.udf_type = UDFType.MODEL_DEPENDENT - assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] + assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"] test_func.udf_type = UDFType.ON_DEMAND assert test_func._get_output_column_names() == ["test_func"] @@ -369,9 +369,9 @@ def test_func(col1, col2, col3): test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == [ - "test_func_col1-col2-col3_0", - "test_func_col1-col2-col3_1", - "test_func_col1-col2-col3_2", + "test_func_col1_col2_col3_0", + "test_func_col1_col2_col3_1", + "test_func_col1_col2_col3_2", ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @@ -446,7 +446,7 @@ def test_func(col1, col2): test_dataframe["column1"], test_dataframe["column2"] ) - assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"]) + assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"]) assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): diff --git a/python/tests/test_training_dataset.py b/python/tests/test_training_dataset.py index 416f3cb860..be771406b2 100644 --- 
a/python/tests/test_training_dataset.py +++ b/python/tests/test_training_dataset.py @@ -57,7 +57,6 @@ def test_from_response_json(self, mocker, backend_fixtures): assert td._from_query == "test_from_query" assert td._querydto == "test_querydto" assert td.feature_store_id == 22 - assert td.transformation_functions == "test_transformation_functions" assert td.train_split == "test_train_split" assert td.training_dataset_type == "HOPSFS_TRAINING_DATASET" assert isinstance(td.storage_connector, storage_connector.JdbcConnector) @@ -102,7 +101,6 @@ def test_from_response_json_basic_info(self, mocker, backend_fixtures): assert td._from_query is None assert td._querydto is None assert td.feature_store_id == 22 - assert td.transformation_functions is None assert td.train_split is None assert td.training_dataset_type is None assert isinstance(td.storage_connector, storage_connector.JdbcConnector) diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index bfc2f125d0..0b83832755 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -17,7 +17,7 @@ import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf from hsfs.transformation_function import TransformationFunction @@ -27,7 +27,7 @@ def test_from_response_json_one_argument_no_statistics(self, backend_fixtures): json = backend_fixtures["transformation_function"][ "get_one_argument_no_statistics_function" ]["response"] - + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -51,6 +51,7 @@ def test_from_response_json_one_argument_with_statistics(self, backend_fixtures) json = backend_fixtures["transformation_function"][ "get_one_argument_with_statistics_function" ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -77,6 +78,7 @@ def test_from_response_json_multiple_argument_with_statistics( json = backend_fixtures["transformation_function"][ "get_multiple_argument_with_statistics_function" ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -105,6 +107,7 @@ def test_from_response_json_multiple_return_type_functions(self, backend_fixture json = backend_fixtures["transformation_function"][ "get_multiple_return_type_functions" ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -141,6 +144,8 @@ def test_from_response_json_list_empty(self, backend_fixtures): def test_from_response_json_list(self, backend_fixtures): # Arrange json = backend_fixtures["transformation_function"]["get_list"]["response"] + for response_json in json["items"]: + response_json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf_list = TransformationFunction.from_response_json(json) @@ -182,6 +187,8 @@ def test_from_response_json_list_one_argument(self, backend_fixtures): json = backend_fixtures["transformation_function"]["get_list_one_argument"][ "response" ] + for response_json in json["items"]: + response_json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -210,6 +217,7 @@ def test(col1): TransformationFunction( featurestore_id=10, hopsworks_udf=test, + transformation_type=UDFType.MODEL_DEPENDENT, ) assert ( @@ 
-225,6 +233,7 @@ def test2(col1): tf = TransformationFunction( featurestore_id=10, hopsworks_udf=test2, + transformation_type=UDFType.MODEL_DEPENDENT, ) assert tf.hopsworks_udf == test2 From 202358db28a26cc9889d8f7e982187b9504e0914 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 4 Jul 2024 11:26:08 +0200 Subject: [PATCH 55/58] adding unit tests for on-demand transformation functions --- python/hsfs/hopsworks_udf.py | 3 +- python/tests/engine/test_python.py | 217 +++++++++- python/tests/engine/test_spark.py | 373 +++++++++++++++++- python/tests/fixtures/feature_fixtures.json | 16 + .../fixtures/feature_group_fixtures.json | 122 ++++++ .../training_dataset_feature_fixtures.json | 78 ++++ python/tests/test_builtin_transformation.py | 81 ---- python/tests/test_feature.py | 20 + python/tests/test_feature_group.py | 59 ++- python/tests/test_feature_view.py | 11 +- python/tests/test_hopswork_udf.py | 283 +++++++++++++ python/tests/test_training_dataset_feature.py | 35 ++ 12 files changed, 1209 insertions(+), 89 deletions(-) delete mode 100644 python/tests/test_builtin_transformation.py diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index a17e432009..10e8135293 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -206,7 +206,8 @@ def _validate_and_convert_drop_features( missing_drop_features = [] for dropped_feature in dropped_features: - if feature_name_prefix + dropped_feature not in transformation_feature: + dropped_feature = feature_name_prefix + dropped_feature + if dropped_feature not in transformation_feature: missing_drop_features.append(dropped_feature) if missing_drop_features: diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index c1ac202fba..cbbe190c4d 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -36,8 +36,9 @@ from hsfs.constructor.hudi_feature_group_alias import HudiFeatureGroupAlias from hsfs.core import inode, job from hsfs.engine import python -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf from hsfs.training_dataset_feature import TrainingDatasetFeature +from hsfs.transformation_function import TransformationFunction from polars.testing import assert_frame_equal as polars_assert_frame_equal @@ -1460,7 +1461,6 @@ def test_parse_schema_feature_group_polars(self, mocker): result = python_engine.parse_schema_feature_group( dataframe=df, time_travel_format=None ) - print(result) # Assert assert len(result) == 3 @@ -1468,6 +1468,71 @@ def test_parse_schema_feature_group_polars(self, mocker): assert result[1].name == "col2" assert result[2].name == "date" + def test_parse_schema_feature_group_transformation_functions(self, mocker): + # Arrange + mocker.patch("hsfs.engine.python.Engine._convert_pandas_dtype_to_offline_type") + + python_engine = python.Engine() + + d = {"Col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=d) + + @udf(int) + def test(feature): + return feature + 1 + + transformation_function = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + # Act + result = python_engine.parse_schema_feature_group( + dataframe=df, + time_travel_format=None, + transformation_functions=[transformation_function], + ) + + # Assert + assert len(result) == 3 + assert result[0].name == "col1" + assert result[1].name == "col2" + assert result[2].name == "test" + + def test_parse_schema_feature_group_transformation_functions_drop(self, mocker): + 
# Arrange + mocker.patch("hsfs.engine.python.Engine._convert_pandas_dtype_to_offline_type") + + python_engine = python.Engine() + + d = {"Col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=d) + + @udf(int, drop="feature") + def test(feature): + return feature + 1 + + transformation_function = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test("col2"), + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + # Act + result = python_engine.parse_schema_feature_group( + dataframe=df, + time_travel_format=None, + transformation_functions=[transformation_function], + ) + + # Assert + assert len(result) == 2 + assert result[0].name == "col1" + assert result[1].name == "test" + def test_parse_schema_training_dataset(self): # Arrange python_engine = python.Engine() @@ -2136,6 +2201,52 @@ def test_save_dataframe(self, mocker): assert mock_python_engine_write_dataframe_kafka.call_count == 0 assert mock_python_engine_legacy_save_dataframe.call_count == 1 + def test_save_dataframe_transformation_functions(self, mocker): + # Arrange + mock_python_engine_write_dataframe_kafka = mocker.patch( + "hsfs.engine.python.Engine._write_dataframe_kafka" + ) + mock_python_engine_legacy_save_dataframe = mocker.patch( + "hsfs.engine.python.Engine.legacy_save_dataframe" + ) + mock_python_engine_apply_transformations = mocker.patch( + "hsfs.engine.python.Engine._apply_transformation_function" + ) + + python_engine = python.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + stream=False, + transformation_functions=[test], + ) + + # Act + python_engine.save_dataframe( + feature_group=fg, + dataframe=None, + operation=None, + online_enabled=None, + storage=None, + offline_write_options=None, + online_write_options=None, + validation_id=None, + ) + + # Assert + assert mock_python_engine_write_dataframe_kafka.call_count == 0 + assert mock_python_engine_legacy_save_dataframe.call_count == 1 + assert mock_python_engine_apply_transformations.call_count == 1 + def test_save_dataframe_stream(self, mocker): # Arrange mock_python_engine_write_dataframe_kafka = mocker.patch( @@ -3327,6 +3438,57 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() + @udf([int, int]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all( + result.columns + == ["col1", "col2", "plus_two_col1_col2_0", "plus_two_col1_col2_1"] + ) + assert len(result) == 2 + assert result["col1"][0] == 1 + assert result["col1"][1] == 2 + assert result["col2"][0] == 10 + assert result["col2"][1] == 11 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 
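# Taken together, the test above and the "drop_all" / "drop_some" variants
# that follow pin down the drop semantics end to end: dropping is applied
# after the UDF output is joined back onto the input dataframe, so only the
# listed inputs disappear while untouched inputs survive alongside the
# transformed columns. Expected output columns, for illustration:
#
#     no drop               -> ["col1", "col2",
#                               "plus_two_col1_col2_0", "plus_two_col1_col2_1"]
#     drop=["col1", "col2"] -> ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]
#     drop=["col1"]         -> ["col2",
#                               "plus_two_col1_col2_0", "plus_two_col1_col2_1"]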
+ + def test_apply_transformation_function_multiple_input_output_drop_all(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + @udf([int, int], drop=["col1", "col2"]) def plus_two(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) @@ -3357,7 +3519,6 @@ def plus_two(col1, col2): ) # Assert - print(result.columns) assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]) assert len(result) == 2 assert result["plus_two_col1_col2_0"][0] == 2 @@ -3365,6 +3526,56 @@ def plus_two(col1, col2): assert result["plus_two_col1_col2_1"][0] == 12 assert result["plus_two_col1_col2_1"][1] == 13 + def test_apply_transformation_function_multiple_input_output_drop_some( + self, mocker + ): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + + @udf([int, int], drop=["col1"]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all( + result.columns == ["col2", "plus_two_col1_col2_0", "plus_two_col1_col2_1"] + ) + assert len(result) == 2 + assert result["col2"][0] == 10 + assert result["col2"][1] == 11 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 + def test_apply_transformation_function_polars(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 0de616084a..5e31959ef4 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -469,7 +469,6 @@ def test_convert_to_default_dataframe_pyspark_rdd(self): # Assert result_df = result.toPandas() - print(result_df) assert list(result_df) == list(expected) for column in list(result_df): assert result_df[column].equals(result_df[column]) @@ -644,6 +643,51 @@ def test_save_dataframe(self, mocker): assert mock_spark_engine_save_online_dataframe.call_count == 0 assert mock_spark_engine_save_offline_dataframe.call_count == 1 + def test_save_dataframe_transformations(self, mocker): + # Arrange + mock_spark_engine_save_online_dataframe = mocker.patch( + "hsfs.engine.spark.Engine._save_online_dataframe" + ) + mock_spark_engine_save_offline_dataframe = mocker.patch( + "hsfs.engine.spark.Engine._save_offline_dataframe" + ) + mock_spark_engine_apply_transformations = mocker.patch( + "hsfs.engine.spark.Engine._apply_transformation_function" + ) + + spark_engine = spark.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + transformation_functions=[test], + ) + + # Act + spark_engine.save_dataframe( + feature_group=fg, + dataframe=None, + 
operation=None, + online_enabled=None, + storage=None, + offline_write_options=None, + online_write_options=None, + validation_id=None, + ) + + # Assert + assert mock_spark_engine_save_online_dataframe.call_count == 0 + assert mock_spark_engine_save_offline_dataframe.call_count == 1 + assert mock_spark_engine_apply_transformations.call_count == 1 + def test_save_dataframe_storage_offline(self, mocker): # Arrange mock_spark_engine_save_online_dataframe = mocker.patch( @@ -979,6 +1023,135 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): == 0 ) + def test_save_stream_dataframe_transformations(self, mocker, backend_fixtures): + # Arrange + mock_client_get_instance = mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") + mock_spark_engine_online_fg_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._online_fg_to_avro" + ) + + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mock_engine_get_instance.return_value.add_file.return_value = ( + "result_from_add_file" + ) + + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + + mock_spark_engine_apply_transformations = mocker.patch( + "hsfs.engine.spark.Engine._apply_transformation_function" + ) + + json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + spark_engine = spark.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + online_topic_name="test_online_topic_name", + transformation_functions=[test], + ) + fg.feature_store = mocker.Mock() + project_id = 1 + fg.feature_store.project_id = project_id + + mock_client_get_instance.return_value._project_name = "test_project_name" + + # Act + spark_engine.save_stream_dataframe( + feature_group=fg, + dataframe=None, + query_name=None, + output_mode="test_mode", + await_termination=None, + timeout=None, + checkpoint_dir=None, + write_options={"test_name": "test_value"}, + ) + + # Assert + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + == "headers" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + 0 + ][0] + == "test_mode" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + 0 + ][0] + == "kafka" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + 0 + ][0] + == "checkpointLocation" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + 0 + ][1] + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + 1 + ] + == { + "kafka.bootstrap.servers": "test_bootstrap_servers", + "kafka.security.protocol": "test_security_protocol", + "kafka.ssl.endpoint.identification.algorithm": 
"test_ssl_endpoint_identification_algorithm", + "kafka.ssl.key.password": "test_ssl_key_password", + "kafka.ssl.keystore.location": "result_from_add_file", + "kafka.ssl.keystore.password": "test_ssl_keystore_password", + "kafka.ssl.truststore.location": "result_from_add_file", + "kafka.ssl.truststore.password": "test_ssl_truststore_password", + "kafka.test_option_name": "test_option_value", + "test_name": "test_value", + } + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + 0 + ][0] + == "topic" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + 0 + ][1] + == "test_online_topic_name" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + 0 + ][0] + == self._get_spark_query_name(project_id, fg) + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + == 0 + ) + assert mock_spark_engine_apply_transformations.call_count == 1 + def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): # Arrange mock_client_get_instance = mocker.patch("hsfs.client.get_instance") @@ -3711,6 +3884,81 @@ def test_parse_schema_feature_group(self, mocker): assert mock_spark_engine_convert_spark_type.call_count == 2 assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + def test_parse_schema_feature_group_transformations(self, mocker): + # Arrange + mock_spark_engine_convert_spark_type = mocker.patch( + "hsfs.engine.spark.Engine.convert_spark_type_to_offline_type" + ) + + spark_engine = spark.Engine() + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"]} + df = pd.DataFrame(data=d) + + @udf(int) + def test(feature): + return feature + 1 + + tf_function = transformation_function.TransformationFunction( + featurestore_id=10, + hopsworks_udf=test, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + # Act + result = spark_engine.parse_schema_feature_group( + dataframe=spark_df, + time_travel_format=None, + transformation_functions=[tf_function], + ) + + # Assert + assert result[0].name == "col_0" + assert result[1].name == "col_1" + assert result[2].name == "test" + assert mock_spark_engine_convert_spark_type.call_count == 2 + assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + + def test_parse_schema_feature_group_transformations_dropped(self, mocker): + # Arrange + mock_spark_engine_convert_spark_type = mocker.patch( + "hsfs.engine.spark.Engine.convert_spark_type_to_offline_type" + ) + + spark_engine = spark.Engine() + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"]} + df = pd.DataFrame(data=d) + + @udf(int, drop="feature") + def test(feature): + return feature + 1 + + tf_function = transformation_function.TransformationFunction( + featurestore_id=10, + hopsworks_udf=test("col_0"), + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + spark_df = 
spark_engine._spark_session.createDataFrame(df) + + # Act + result = spark_engine.parse_schema_feature_group( + dataframe=spark_df, + time_travel_format=None, + transformation_functions=[tf_function], + ) + + # Assert + assert result[0].name == "col_1" + assert result[1].name == "test" + assert mock_spark_engine_convert_spark_type.call_count == 2 + assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + def test_parse_schema_feature_group_hudi(self, mocker): # Arrange mock_spark_engine_convert_spark_type = mocker.patch( @@ -4449,6 +4697,129 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() + @udf([int, int]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_0": [1, 2], + "col_1": ["test_1", "test_2"], + "col_2": [10, 11], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], + } + ) + + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output_drop_some( + self, mocker + ): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @udf([int, int], drop=["col1"]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "col_2": [10, 11], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], + } + ) + 
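+        # The expected frame above encodes the semantics under test: binding
+        # drop=["col1"] to "col_0" via tf("col_0", "col_2") removes col_0,
+        # keeps the untouched columns, and appends the two generated outputs
+        # test_col_0_col_2_0 and test_col_0_col_2_1.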
+ expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output_drop_all(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + @udf([int, int], drop=["col1", "col2"]) def test(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) diff --git a/python/tests/fixtures/feature_fixtures.json b/python/tests/fixtures/feature_fixtures.json index 1d486c0cc4..c9b330768e 100644 --- a/python/tests/fixtures/feature_fixtures.json +++ b/python/tests/fixtures/feature_fixtures.json @@ -9,6 +9,22 @@ "partition": false, "primary": true, "type": "int", + "on_demand": false, + "description": "test_description", + "feature_group": null + } + }, + "get_on_demand": { + "response": { + "defaultValue": "1", + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int", + "on_demand": true, "description": "test_description", "feature_group": null } diff --git a/python/tests/fixtures/feature_group_fixtures.json b/python/tests/fixtures/feature_group_fixtures.json index 484a9e288d..c2394ed4cb 100644 --- a/python/tests/fixtures/feature_group_fixtures.json +++ b/python/tests/fixtures/feature_group_fixtures.json @@ -630,5 +630,127 @@ "version": 1 }, "headers": null + }, + "get_transformations": { + "response": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "transformation_functions":[ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_two(data1 : pd.Series):\n return data1 + 2\n", + "name": "add_two", + "outputTypes":["double"], + "transformationFeatures":["data"], + "dropped_features":["data1"] + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_features":["data1"] + } + } + ], + "onlineTopicName": 
"119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI", + "expectationSuite": { + "expectation_suite_name": "test_expectation_suite_name", + "expectations": [ + { + "expectation_type": "1", + "kwargs": "{ \"kwargs_key\": \"kwargs_value\" }", + "meta": "{ \"meta_key\": \"meta_value\" }", + "id": 32 + } + ], + "meta": "{ \"great_expectations_version\": \"0.15.12\", \"key\": \"value\" }", + "id": 21, + "data_asset_type": "test_data_asset_type", + "ge_cloud_id": "test_ge_cloud_id", + "run_validation": "test_run_validation", + "validation_ingestion_policy": "test_validation_ingestion_policy", + "feature_store_id": 67, + "feature_group_id": 15, + "href": "test_/featurestores/67/featuregroups/15/expectationsuite", + "expand": "test_expand", + "items": "test_items", + "type": "expectationSuiteDTO", + "created": "test_created" + } + }, + "method": "GET", + "path_params": [ + "project", + "119", + "featurestores", + 67, + "featuregroups", + "fg_test" + ], + "query_params": { + "version": 1 + }, + "headers": null } } diff --git a/python/tests/fixtures/training_dataset_feature_fixtures.json b/python/tests/fixtures/training_dataset_feature_fixtures.json index f48fd0fabd..27cd07f302 100644 --- a/python/tests/fixtures/training_dataset_feature_fixtures.json +++ b/python/tests/fixtures/training_dataset_feature_fixtures.json @@ -65,6 +65,84 @@ "label": "test_label" } }, + "get_transformations": { + "response": { + "name": "test_name", + "type": "test_type", + "index": "test_index", + "transformation_function": { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_features":["data1"] + } + }, + "featuregroup": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "feature_group_feature_name": "test_feature_group_feature_name", + "label": "test_label" + } + }, "get_fraud_online_training_dataset_features": { "response": [ { diff --git a/python/tests/test_builtin_transformation.py b/python/tests/test_builtin_transformation.py deleted file mode 100644 index 4a8a01af9c..0000000000 --- a/python/tests/test_builtin_transformation.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# 
Copyright 2024 Hopsworks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import hsfs.engine as engine -import pandas as pd -from hsfs.builtin_transformations import ( - min_max_scaler, -) -from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.engine import python -from hsfs.hopsworks_udf import UDFType - - -class TestBuiltinTransformations: - @staticmethod - def validate_transformations_python( - transformed_outputs, expected_output, expected_col_names - ): - if isinstance(transformed_outputs, pd.Series): - assert transformed_outputs.name == expected_col_names - else: - assert all(transformed_outputs.columns == expected_col_names) - assert all(transformed_outputs.values == expected_output.values) - - def test_min_max_scaler(self): - test_dataframe = pd.DataFrame( - { - "col1": [1, 2, 3, 4], - "col2": [1.2, 3.4, 5.6, 9.1], - } - ) - statistics_df = test_dataframe.describe().to_dict() - - # Test case 1 : Integer column - min_max_scaler_col1 = min_max_scaler("col1") - min_max_scaler_col1.udf_type = UDFType.MODEL_DEPENDENT - - min_max_scaler_col1.transformation_statistics = [ - FeatureDescriptiveStatistics( - feature_name="col1", - min=statistics_df["col1"]["min"], - max=statistics_df["col1"]["max"], - ) - ] - - expected_df = (test_dataframe["col1"] - test_dataframe["col1"].min()) / ( - test_dataframe["col1"].max() - test_dataframe["col1"].min() - ) - - # Test with python engine - engine.set_instance(engine=python.Engine(), engine_type="python") - - transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) - TestBuiltinTransformations.validate_transformations_python( - transformed_outputs=transformed_df, - expected_output=expected_df, - expected_col_names="min_max_scaler_col1_", - ) - - # Test with spark engine - engine.set_instance(engine=python.Engine(), engine_type="python") - - transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) - TestBuiltinTransformations.validate_transformations_python( - transformed_outputs=transformed_df, - expected_output=expected_df, - expected_col_names="min_max_scaler_col1_", - ) diff --git a/python/tests/test_feature.py b/python/tests/test_feature.py index 8194035040..61ce72f288 100644 --- a/python/tests/test_feature.py +++ b/python/tests/test_feature.py @@ -36,6 +36,26 @@ def test_from_response_json(self, backend_fixtures): assert f.online_type == "int" assert f.default_value == "1" # default value should be specified as string assert f._feature_group_id == 15 + assert not f.on_demand + + def test_from_response_json_on_demand(self, backend_fixtures): + # Arrange + json = backend_fixtures["feature"]["get_on_demand"]["response"] + + # Act + f = feature.Feature.from_response_json(json) + + # Assert + assert f.name == "intt" + assert f.type == "int" + assert f.description == "test_description" + assert f.primary is True + assert f.partition is False + assert f.hudi_precombine_key is True + assert f.online_type == "int" + assert f.default_value == "1" # default value should be specified as 
string + assert f._feature_group_id == 15 + assert f.on_demand def test_from_response_json_basic_info(self, backend_fixtures): # Arrange diff --git a/python/tests/test_feature_group.py b/python/tests/test_feature_group.py index 56b870d23e..8e2ba67cdf 100644 --- a/python/tests/test_feature_group.py +++ b/python/tests/test_feature_group.py @@ -32,6 +32,7 @@ ) from hsfs.client.exceptions import FeatureStoreException, RestAPIError from hsfs.engine import python +from hsfs.hopsworks_udf import UDFType engine.init("python") @@ -145,7 +146,7 @@ def test_from_response_json_basic_info(self, backend_fixtures): assert fg._feature_store_id == 67 assert fg.description == "" assert fg.partition_key == [] - assert fg.primary_key == ['intt'] + assert fg.primary_key == ["intt"] assert fg.hudi_precombine_key is None assert fg._feature_store_name is None assert fg.created is None @@ -322,7 +323,7 @@ def test_constructor_with_list_event_time_for_compatibility( version=1, description="fg_description", event_time=["event_date"], - features=features + features=features, ) with pytest.raises(FeatureStoreException): util.verify_attribute_key_names(new_fg, False) @@ -885,3 +886,57 @@ def test_feature_group_save_expectation_suite_from_hopsworks_type( mock_print.call_args[0][0][:63] == "Updated expectation suite attached to Feature Group, edit it at" ) + + def test_from_response_json_transformation_functions(self, backend_fixtures): + # Arrange + json = backend_fixtures["feature_group"]["get_transformations"]["response"] + + # Act + fg = feature_group.FeatureGroup.from_response_json(json) + + # Assert + assert fg.name == "fg_test" + assert fg.version == 1 + assert fg._feature_store_id == 67 + assert fg.description == "test_description" + assert fg.partition_key == [] + assert fg.primary_key == ["intt"] + assert fg.hudi_precombine_key == "intt" + assert fg._feature_store_name == "test_featurestore" + assert fg.created == "2022-08-01T11:07:55Z" + assert len(fg.transformation_functions) == 2 + assert ( + fg.transformation_functions[0].hopsworks_udf.function_name == "add_one_fs" + ) + assert fg.transformation_functions[1].hopsworks_udf.function_name == "add_two" + assert ( + fg.transformation_functions[0].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + assert ( + fg.transformation_functions[1].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_two(data1 : pd.Series):\n return data1 + 2\n" + ) + assert ( + fg.transformation_functions[0].hopsworks_udf.udf_type == UDFType.ON_DEMAND + ) + assert ( + fg.transformation_functions[1].hopsworks_udf.udf_type == UDFType.ON_DEMAND + ) + assert isinstance(fg.creator, user.User) + assert fg.id == 15 + assert len(fg.features) == 2 + assert isinstance(fg.features[0], feature.Feature) + assert ( + fg.location + == "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1" + ) + assert fg.online_enabled is True + assert fg.time_travel_format == "HUDI" + assert isinstance(fg.statistics_config, statistics_config.StatisticsConfig) + assert fg._online_topic_name == "119_15_fg_test_1_onlinefs" + assert fg.event_time is None + assert fg.stream is False + assert ( + fg.expectation_suite.expectation_suite_name == "test_expectation_suite_name" + ) diff --git a/python/tests/test_feature_view.py b/python/tests/test_feature_view.py index a45093126b..57aa5c1b93 100644 --- a/python/tests/test_feature_view.py +++ b/python/tests/test_feature_view.py @@ -18,7 +18,7 @@ from hsfs import feature_view, 
training_dataset_feature from hsfs.constructor import fs_query, query from hsfs.feature_store import FeatureStore -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf class TestFeatureView: @@ -106,6 +106,15 @@ def test_from_response_json_transformation_function(self, mocker, backend_fixtur fv.transformation_functions[1].hopsworks_udf._function_source == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) + assert ( + fv.transformation_functions[0].hopsworks_udf.udf_type + == UDFType.MODEL_DEPENDENT + ) + assert ( + fv.transformation_functions[1].hopsworks_udf.udf_type + == UDFType.MODEL_DEPENDENT + ) + assert len(fv.schema) == 2 assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index fe9531b751..dfef840067 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -336,6 +336,21 @@ def test_func(col1): test_func.udf_type = UDFType.ON_DEMAND assert test_func._get_output_column_names() == ["test_func"] + def test_generate_output_column_names_one_argument_one_output_type_prefix(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == ["test_func_prefix_col1_"] + assert test_func.output_column_names == ["prefix_test_func_prefix_col1_"] + + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + assert test_func.output_column_names == ["prefix_test_func"] + def test_generate_output_column_names_multiple_argument_one_output_type(self): @udf(int) def test_func(col1, col2, col3): @@ -346,6 +361,26 @@ def test_func(col1, col2, col3): test_func.udf_type = UDFType.ON_DEMAND assert test_func._get_output_column_names() == ["test_func"] + def test_generate_output_column_names_multiple_argument_one_output_type_prefix( + self, + ): + @udf(int) + def test_func(col1, col2, col3): + return col1 + 1 + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_prefix_col2_prefix_col3_" + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_" + ] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + assert test_func.output_column_names == ["prefix_test_func"] + def test_generate_output_column_names_single_argument_multiple_output_type(self): @udf([int, float, int]) def test_func(col1): @@ -360,6 +395,29 @@ def test_func(col1): "test_func_col1_2", ] + def test_generate_output_column_names_single_argument_multiple_output_type_prefix( + self, + ): + @udf([int, float, int]) + def test_func(col1): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_0", + "test_func_prefix_col1_1", + "test_func_prefix_col1_2", + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_0", + "prefix_test_func_prefix_col1_1", + "prefix_test_func_prefix_col1_2", + ] + def test_generate_output_column_names_multiple_argument_multiple_output_type(self): @udf([int, float, int]) def test_func(col1, col2, col3): @@ -374,6 
+432,91 @@ def test_func(col1, col2, col3): "test_func_col1_col2_col3_2", ] + def test_generate_output_column_names_multiple_argument_multiple_output_type_prefix( + self, + ): + @udf([int, float, int]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_prefix_col2_prefix_col3_0", + "test_func_prefix_col1_prefix_col2_prefix_col3_1", + "test_func_prefix_col1_prefix_col2_prefix_col3_2", + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_0", + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_1", + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_2", + ] + + def test_drop_features_one_element(self): + @udf([int, float, int], drop="col1") + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func.dropped_features == ["col1"] + + def test_drop_features_one_element_prefix(self): + @udf([int, float, int], drop="col1") + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func._dropped_features == ["col1"] + assert test_func.dropped_features == ["prefix_col1"] + + def test_drop_features_multiple_element(self): + @udf([int, float, int], drop=["col1", "col2"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func.dropped_features == ["col1", "col2"] + + def test_drop_features_multiple_element_prefix(self): + @udf([int, float, int], drop=["col1", "col2"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func._dropped_features == ["col1", "col2"] + assert test_func.dropped_features == ["prefix_col1", "prefix_col2"] + + def test_drop_features_invalid(self): + with pytest.raises(FeatureStoreException) as exp: + + @udf([int, float, int], drop=["col1", "invalid_col"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + assert ( + str(exp.value) + == "Cannot drop features 'invalid_col' as they are not features given as arguments in the defined UDF." 
+ ) + def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @udf(int) def test_func(col1): @@ -460,6 +603,16 @@ def test_func(col1): assert test_func("new_feature").transformation_features == ["new_feature"] assert test_func("new_feature").statistics_features == [] + # Test with prefix + test_func._feature_name_prefix = "prefix_" + assert test_func.transformation_features == ["prefix_col1"] + assert test_func.statistics_features == [] + + assert test_func("new_feature").transformation_features == [ + "prefix_new_feature" + ] + assert test_func("new_feature").statistics_features == [] + def test_HopsworkUDf_call_one_argument_statistics(self): from hsfs.transformation_statistics import TransformationStatistics @@ -477,6 +630,18 @@ def test_func(col1, statistics=stats): assert test_func("new_feature").statistics_features == ["new_feature"] assert test_func("new_feature")._statistics_argument_names == ["col1"] + # Test with prefix + test_func._feature_name_prefix = "prefix_" + assert test_func.transformation_features == ["prefix_col1"] + assert test_func.statistics_features == ["col1"] + assert test_func._statistics_argument_names == ["col1"] + + assert test_func("new_feature").transformation_features == [ + "prefix_new_feature" + ] + assert test_func("new_feature").statistics_features == ["new_feature"] + assert test_func("new_feature")._statistics_argument_names == ["col1"] + def test_HopsworkUDf_call_multiple_argument_statistics(self): from hsfs.transformation_statistics import TransformationStatistics @@ -495,3 +660,121 @@ def test_func(col1, col2, col3, statistics=stats): "col1", "col3", ] + + def test_validate_and_convert_drop_features(self): + dropped_features = "feature1" + transformation_feature = ["feature1", "feature2"] + feature_name_prefix = None + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1"] + + def test_validate_and_convert_drop_features_dropped_list(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1", "feature2"] + + def test_validate_and_convert_drop_features_dropped_invalid(self): + dropped_features = "feature4" + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'feature4' as they are not features given as arguments in the defined UDF." + ) + + def test_validate_and_convert_drop_features_dropped_invalid_list(self): + dropped_features = ["feature4", "feature5"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'feature4', 'feature5' as they are not features given as arguments in the defined UDF." 
+ ) + + def test_validate_and_convert_drop_features_dropped_list_prefix(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["test_feature1", "test_feature2", "test_feature3"] + feature_name_prefix = "test_" + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1", "feature2"] + + def test_validate_and_convert_drop_features_dropped_prefix_invalid(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = "test_" + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'test_feature1', 'test_feature2' as they are not features given as arguments in the defined UDF." + ) + + def test_validate_udf_type_None(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + with pytest.raises(FeatureStoreException) as exe: + test_func._validate_udf_type() + + assert str(exe.value) == "UDF Type cannot be None" + + def test_validate_udf_type_on_demand_multiple_output(self): + @udf([int, float]) + def test_func(col1, col2): + return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) + + with pytest.raises(FeatureStoreException) as exe: + test_func.udf_type = UDFType.ON_DEMAND + + assert ( + str(exe.value) + == "On-Demand Transformation functions can only return one column as output" + ) + + def test_validate_udf_type_on_demand_statistics(self): + from hsfs.transformation_statistics import TransformationStatistics + + stats = TransformationStatistics("col1") + + @udf(int) + def test_func(col1, statistics=stats): + return col1 + statistics.col1.mean + + with pytest.raises(FeatureStoreException) as exe: + test_func.udf_type = UDFType.ON_DEMAND + + assert ( + str(exe.value) + == "On-Demand Transformation functions cannot use statistics, please remove statistics parameters from the functions" + ) diff --git a/python/tests/test_training_dataset_feature.py b/python/tests/test_training_dataset_feature.py index dc5af26112..81c7fd6d14 100644 --- a/python/tests/test_training_dataset_feature.py +++ b/python/tests/test_training_dataset_feature.py @@ -16,6 +16,7 @@ from hsfs import feature_group, training_dataset_feature +from hsfs.hopsworks_udf import UDFType class TestTrainingDatasetFeature: @@ -38,6 +39,40 @@ def test_from_response_json(self, backend_fixtures): ) assert td_feature.label == "test_label" + def test_from_response_json_on_demand_transformation(self, backend_fixtures): + # Arrange + json = backend_fixtures["training_dataset_feature"]["get_transformations"][ + "response" + ] + + # Act + td_feature = training_dataset_feature.TrainingDatasetFeature.from_response_json( + json + ) + + # Assert + assert td_feature.name == "test_name" + assert td_feature.type == "test_type" + assert td_feature.index == "test_index" + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf.function_name + == "add_one_fs" + ) + + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf.udf_type + == UDFType.ON_DEMAND + ) + assert isinstance(td_feature._feature_group, feature_group.FeatureGroup) + assert ( + 
td_feature._feature_group_feature_name == "test_feature_group_feature_name"
+        )
+        assert td_feature.label == "test_label"
+
     def test_from_response_json_basic_info(self, backend_fixtures):
         # Arrange
         json = backend_fixtures["training_dataset_feature"]["get_basic_info"][

From 99001d22dd380765ce20bbdc08d8a41e43c406f3 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Thu, 4 Jul 2024 12:24:46 +0200
Subject: [PATCH 56/58] adding documentation

---
 .../core/transformation_function_engine.py |  9 ++++++
 python/hsfs/feature_store.py               | 32 ++++++++++++++++---
 python/hsfs/feature_view.py                |  6 ++--
 python/hsfs/hopsworks_udf.py               | 26 ++++++++++++---
 4 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py
index 6bdbff13c9..4ab8c6a166 100644
--- a/python/hsfs/core/transformation_function_engine.py
+++ b/python/hsfs/core/transformation_function_engine.py
@@ -147,6 +147,15 @@ def get_ready_to_use_transformation_fns(
         feature_view: feature_view.FeatureView,
         training_dataset_version: Optional[int] = None,
     ) -> List[transformation_function.TransformationFunction]:
+        """
+        Function that updates the statistics required by all transformation functions in the feature view, based on the training dataset version.
+
+        # Arguments
+            feature_view `FeatureView`: The feature view in which the training data is being created.
+            training_dataset_version `int`: The training dataset version whose statistics are used to update the transformation functions.
+        # Returns
+            `List[transformation_function.TransformationFunction]` : List of transformation functions.
+        """
         # check if transformation functions require statistics
         is_stat_required = any(
             [
diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index 4da096d80c..2ec47f312e 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -521,13 +521,26 @@ def create_feature_group(
             # connect to the Feature Store
             fs = ...

+            # define the on-demand transformation functions
+            @udf(int)
+            def plus_one(value):
+                return value + 1
+
+            @udf(int)
+            def plus_two(value):
+                return value + 2
+
+            # construct a list of "transformation functions" on features
+            transformation_functions = [plus_one("feature1"), plus_two("feature2")]
+
             fg = fs.create_feature_group(
                     name='air_quality',
                     description='Air Quality characteristics of each day',
                     version=1,
                     primary_key=['city','date'],
                     online_enabled=True,
-                    event_time='date'
+                    event_time='date',
+                    transformation_functions=transformation_functions
                 )
             ```
@@ -595,7 +608,9 @@ def create_feature_group(
                 defaults to using project topic.
             notification_topic_name: Optionally, define the name of the topic used for sending notifications
                 when entries are inserted or updated on the online feature store. If left undefined no notifications are sent.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: On-Demand Transformation functions attached to the feature group.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns
             `FeatureGroup`. The feature group metadata object.
@@ -669,6 +684,7 @@ def get_or_create_feature_group(
                 primary_key=["day", "area"],
                 online_enabled=True,
                 event_time="timestamp",
+                transformation_functions=transformation_functions,
             )
             ```
@@ -734,7 +750,9 @@ def get_or_create_feature_group(
                 defaults to using project topic.
             notification_topic_name: Optionally, define the name of the topic used for sending notifications
                 when entries are inserted or updated on the online feature store. If left undefined no notifications are sent.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: On-Demand Transformation functions attached to the feature group.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns
             `FeatureGroup`. The feature group metadata object.
@@ -1543,7 +1561,9 @@ def plus_one(value):
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[], no training helper
                 columns.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: Model Dependent Transformation functions attached to the feature view.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns:
             `FeatureView`: The feature view metadata object.
@@ -1618,7 +1638,9 @@ def get_or_create_feature_view(
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[], no training helper
                 columns.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: Model Dependent Transformation functions attached to the feature view.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns:
             `FeatureView`: The feature view metadata object.
diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py
index f2f5019160..0045ecd713 100644
--- a/python/hsfs/feature_view.py
+++ b/python/hsfs/feature_view.py
@@ -568,7 +568,7 @@ def get_feature_vector(
             force_sql_client: boolean, defaults to False. If set to True, reads from online feature store
                 using the SQL client if initialised.
             allow_missing: Setting to `True` returns feature vectors with missing values.
-            request_parameters: Request parameters required by on-demand transformation functions.
+            request_parameters: Request parameters required by on-demand transformation functions to compute on-demand features present in the feature view.

         # Returns
             `list`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list"`, `"pandas"`, `"polars"` or `"numpy"`
@@ -678,6 +678,7 @@ def get_feature_vectors(
             force_rest_client: boolean, defaults to False. If set to True, reads from online feature store
                 using the REST client if initialised.
             allow_missing: Setting to `True` returns feature vectors with missing values.
+            request_parameters: Request parameters required by on-demand transformation functions to compute on-demand features present in the feature view.

         # Returns
             `List[list]`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list", `"pandas"`,`"polars"` or `"numpy"`
@@ -859,9 +860,6 @@ def find_neighbors(
             the number of results returned may be less than k. Try using a large
             value of k and extract the top k items from the results if needed.

-        !!! warning "Duplicate column error in Polars"
-            If the feature view has duplicate column names, attempting to create a polars DataFrame
-            will raise an error. To avoid this, set `return_type` to `"list"` or `"pandas"`.

         # Arguments
             embedding: The target embedding for which neighbors are to be found.
diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py
index 10e8135293..f75c9f861e 100644
--- a/python/hsfs/hopsworks_udf.py
+++ b/python/hsfs/hopsworks_udf.py
@@ -33,6 +33,10 @@
 class UDFType(Enum):
+    """
+    Class that stores the possible types of transformation functions.
+    """
+
     MODEL_DEPENDENT = "model_dependent"
     ON_DEMAND = "on_demand"
@@ -111,11 +115,14 @@ class HopsworksUdf:
     Attributes
     ----------
-    output_type (List[str]) : Output types of the columns returned from the UDF.
     function_name (str) : Name of the UDF
-    statistics_required (bool) : True if statistics is required for any of the parameters of the UDF.
-    transformation_statistics (Dict[str, FeatureDescriptiveStatistics]): Dictionary that maps the statistics_argument name in the function to the actual statistics variable.
+    udf_type (UDFType): Type of the UDF, either \"model dependent\" or \"on-demand\".
+    return_types (List[str]): The data types of the columns returned from the UDF.
     transformation_features (List[str]) : List of feature names to which the transformation function would be applied.
+    output_column_names (List[str]): Column names of the DataFrame returned after application of the transformation function.
+    dropped_features (List[str]): List of features that will be dropped after the UDF is applied.
+    transformation_statistics (Dict[str, FeatureDescriptiveStatistics]): Dictionary that maps the statistics_argument name in the function to the actual statistics variable.
+    statistics_required (bool) : True if statistics is required for any of the parameters of the UDF.
     statistics_features (List[str]) : List of feature names that requires statistics.
@@ -715,6 +722,12 @@ def from_response_json(
         return hopsworks_udf

     def _validate_udf_type(self):
+        """
+        Function that validates whether the defined transformation function can be used for the specified UDF type.
+
+        # Raises
+            `hsfs.client.exceptions.FeatureStoreException` : If the UDF type is None, or if an on-demand transformation function outputs multiple columns or uses statistics.
+        """
         if self.udf_type is None:
             raise FeatureStoreException("UDF Type cannot be None")
@@ -785,7 +798,7 @@ def transformation_features(self) -> List[str]:
     @property
     def statistics_features(self) -> List[str]:
         """
-        list of feature names that require statistics
+        List of feature names that require statistics
         """
         return [
             transformation_feature.feature_name
@@ -806,7 +819,7 @@ def _statistics_argument_mapping(self) -> Dict[str, str]:
     @property
     def _statistics_argument_names(self) -> List[str]:
         """
-        list of argument names required for statistics
+        List of argument names required for statistics
         """
         return [
             transformation_feature.statistic_argument_name
@@ -827,6 +840,9 @@ def udf_type(self, udf_type: UDFType) -> None:
     @property
     def dropped_features(self) -> List[str]:
+        """
+        List of features that will be dropped after the UDF is applied.
+ """ if self._feature_name_prefix: return [ self._feature_name_prefix + dropped_feature From c71af3bb16db654876a54da7530a448b3c07dc20 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 8 Jul 2024 11:09:40 +0200 Subject: [PATCH 57/58] adopting changes in backend for UI --- python/hsfs/engine/python.py | 7 +- python/hsfs/engine/spark.py | 7 +- python/hsfs/feature_group.py | 34 ++++---- python/hsfs/feature_view.py | 34 ++++---- python/hsfs/hopsworks_udf.py | 85 +++++++++++++------ python/hsfs/transformation_function.py | 4 +- ...t_python_spark_transformation_functions.py | 6 +- .../fixtures/feature_group_fixtures.json | 4 +- .../tests/fixtures/feature_view_fixtures.json | 4 +- .../training_dataset_feature_fixtures.json | 2 +- .../transformation_function_fixtures.json | 14 +-- 11 files changed, 123 insertions(+), 78 deletions(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 9c2a4ca279..b0efd7be0e 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -827,7 +827,8 @@ def parse_schema_feature_group( on_demand=True, ) ) - dropped_features.extend(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.extend(tf.hopsworks_udf.dropped_features) for feat_name in arrow_schema.names: name = util.autofix_feature_name(feat_name) try: @@ -1364,8 +1365,8 @@ def _apply_transformation_function( raise FeatureStoreException( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - - dropped_features.update(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.update(tf.hopsworks_udf.dropped_features) dataset = pd.concat( [ dataset, diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 60f5f14854..322e9e993a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1145,7 +1145,8 @@ def parse_schema_feature_group( on_demand=True, ) ) - dropped_features.extend(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.extend(tf.hopsworks_udf.dropped_features) using_hudi = time_travel_format == "HUDI" for feat in dataframe.schema: @@ -1290,8 +1291,8 @@ def _apply_transformation_function( raise FeatureStoreException( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." 
) - - dropped_features.update(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.update(tf.hopsworks_udf.dropped_features) pandas_udf = hopsworks_udf.get_udf() output_col_name = hopsworks_udf.output_column_names[0] diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index 0bbeb26552..8240f115e9 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -2135,21 +2135,25 @@ def __init__( self._writer: Optional[callable] = None # On-Demand Transformation Functions - self._transformation_functions: List[TransformationFunction] = ( - [ - TransformationFunction( - featurestore_id, - hopsworks_udf=transformation_function, - version=1, - transformation_type=UDFType.ON_DEMAND, - ) - if not isinstance(transformation_function, TransformationFunction) - else transformation_function - for transformation_function in transformation_functions - ] - if transformation_functions - else [] - ) + self._transformation_functions: List[TransformationFunction] = [] + + if transformation_functions: + for transformation_function in transformation_functions: + if not isinstance(transformation_function, TransformationFunction): + self._transformation_functions.append( + TransformationFunction( + featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + ) + else: + if not transformation_function.hopsworks_udf.udf_type: + transformation_function.hopsworks_udf.udf_type = ( + UDFType.ON_DEMAND + ) + self._transformation_functions.append(transformation_function) if self._transformation_functions: self._transformation_functions = ( diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 0045ecd713..fc9151ae94 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -120,21 +120,25 @@ def __init__( training_helper_columns if training_helper_columns else [] ) - self._transformation_functions: List[TransformationFunction] = ( - [ - TransformationFunction( - self.featurestore_id, - hopsworks_udf=transformation_function, - version=1, - transformation_type=UDFType.MODEL_DEPENDENT, - ) - if not isinstance(transformation_function, TransformationFunction) - else transformation_function - for transformation_function in transformation_functions - ] - if transformation_functions - else [] - ) + self._transformation_functions: List[TransformationFunction] = [] + + if transformation_functions: + for transformation_function in transformation_functions: + if not isinstance(transformation_function, TransformationFunction): + self._transformation_functions.append( + TransformationFunction( + self.featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ) + else: + if not transformation_function.hopsworks_udf.udf_type: + transformation_function.hopsworks_udf.udf_type = ( + UDFType.MODEL_DEPENDENT + ) + self._transformation_functions.append(transformation_function) if self._transformation_functions: self._transformation_functions = FeatureView._sort_transformation_functions( diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index f75c9f861e..697eb06f38 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -75,7 +75,9 @@ def add_one(data1 : pd.Series): """ def wrapper(func: Callable) -> HopsworksUdf: - udf = HopsworksUdf(func=func, return_types=return_type, dropped_features=drop) + udf = HopsworksUdf( + func=func, return_types=return_type, 
dropped_argument_names=drop + ) return udf return wrapper @@ -143,7 +145,11 @@ def __init__( return_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, - dropped_features: Optional[List[str]] = None, + transformation_function_argument_names: Optional[ + List[TransformationFeature] + ] = None, + dropped_argument_names: Optional[List[str]] = None, + dropped_feature_names: Optional[List[str]] = None, feature_name_prefix: Optional[str] = None, ): self._return_types: List[str] = HopsworksUdf._validate_and_convert_output_types( @@ -162,24 +168,41 @@ def __init__( else func ) if not transformation_features: + # New transformation function being declared so extract source code from function self._transformation_features: List[TransformationFeature] = ( HopsworksUdf._extract_function_arguments(func) if not transformation_features else transformation_features ) + + self._transformation_function_argument_names = [ + feature.feature_name for feature in self._transformation_features + ] + + self._dropped_argument_names: List[str] = ( + HopsworksUdf._validate_and_convert_drop_features( + dropped_argument_names, + self.transformation_features, + feature_name_prefix, + ) + ) + self._dropped_features = self._dropped_argument_names else: self._transformation_features = transformation_features + self._transformation_function_argument_names = ( + transformation_function_argument_names + ) + self._dropped_argument_names = dropped_argument_names + self._dropped_features = ( + dropped_feature_names + if dropped_feature_names + else dropped_argument_names + ) self._formatted_function_source, self._module_imports = ( HopsworksUdf._format_source_code(self._function_source) ) - self._dropped_features: List[str] = ( - HopsworksUdf._validate_and_convert_drop_features( - dropped_features, self.transformation_features, feature_name_prefix - ) - ) - self._statistics: Optional[TransformationStatistics] = None self._udf_type: UDFType = None @@ -201,7 +224,7 @@ def _validate_and_convert_drop_features( `List[str]`: A list of features to be dropped. """ if not dropped_features: - return [] + return None dropped_features = ( [dropped_features] @@ -554,11 +577,16 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": f'Feature names provided must be string "{arg}" is not string' ) transformation_feature_name = self.transformation_features - index_dropped_features = [ - transformation_feature_name.index(dropped_feature) - for dropped_feature in self.dropped_features - ] - updated_dropped_features = [features[index] for index in index_dropped_features] + if self.dropped_features: + index_dropped_features = [ + transformation_feature_name.index(dropped_feature) + for dropped_feature in self.dropped_features + ] + updated_dropped_features = [ + features[index] for index in index_dropped_features + ] + else: + updated_dropped_features = None # Create a copy of the UDF to associate it with new feature names. udf = copy.deepcopy(self) @@ -601,6 +629,8 @@ def get_udf(self, force_python_udf: bool = False) -> Callable: # Returns `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
""" + if self.udf_type is None: + raise FeatureStoreException("UDF Type cannot be None") if engine.get_type() in ["hive", "python", "training"] or force_python_udf: return self.hopsworksUdf_wrapper() @@ -623,7 +653,8 @@ def to_dict(self) -> Dict[str, Any]: "sourceCode": self._function_source, "outputTypes": self.return_types, "transformationFeatures": self.transformation_features, - "droppedFeatures": self.dropped_features, + "transformationFunctionArgumentNames": self._transformation_function_argument_names, + "droppedArgumentNames": self._dropped_argument_names, "statisticsArgumentNames": self._statistics_argument_names if self.statistics_required else None, @@ -663,12 +694,12 @@ def from_response_json( transformation_features = [ feature.strip() for feature in json_decamelized["transformation_features"] ] - dropped_features = ( + dropped_argument_names = ( [ dropped_feature.strip() - for dropped_feature in json_decamelized["dropped_features"] + for dropped_feature in json_decamelized["dropped_argument_names"] ] - if "dropped_features" in json_decamelized + if "dropped_argument_names" in json_decamelized else None ) statistics_features = ( @@ -687,11 +718,14 @@ def from_response_json( arg_list if not transformation_features else transformation_features ) - if dropped_features: - dropped_features = [ - transformation_features[arg_list.index(dropped_feature)] - for dropped_feature in dropped_features + dropped_feature_names = ( + [ + transformation_features[arg_list.index(dropped_argument_name)] + for dropped_argument_name in dropped_argument_names ] + if dropped_argument_names + else None + ) if statistics_features: transformation_features = [ @@ -714,7 +748,8 @@ def from_response_json( return_types=output_types, name=function_name, transformation_features=transformation_features, - dropped_features=dropped_features, + dropped_argument_names=dropped_argument_names, + dropped_feature_names=dropped_feature_names, feature_name_prefix=feature_name_prefix, ) @@ -728,8 +763,6 @@ def _validate_udf_type(self): # Raises `hsfs.client.exceptions.FeatureStoreException` : If the UDF Type is None or if statistics or multiple columns has been output by a on-demand transformation function """ - if self.udf_type is None: - raise FeatureStoreException("UDF Type cannot be None") if self._udf_type == UDFType.ON_DEMAND: if len(self.return_types) > 1: @@ -843,7 +876,7 @@ def dropped_features(self) -> List[str]: """ List of features that will be dropped after the UDF is applied. 
""" - if self._feature_name_prefix: + if self._feature_name_prefix and self._dropped_features: return [ self._feature_name_prefix + dropped_feature for dropped_feature in self._dropped_features diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 65535aa539..fe30047384 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -241,5 +241,7 @@ def __repr__(self): return ( f"Model-Dependent Transformation Function : {repr(self.hopsworks_udf)}" ) - else: + elif self.hopsworks_udf._udf_type == UDFType.ON_DEMAND: return f"On-Demand Transformation Function : {repr(self.hopsworks_udf)}" + else: + return f"Transformation Function : {repr(self.hopsworks_udf)}" diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index 71bb48cd05..8c29128641 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -161,7 +161,7 @@ def test_apply_builtin_minmax_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "min_max_scaler", - "droppedFeatures": ["feature"], + "droppedArgumentNames": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -304,7 +304,7 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "standard_scaler", - "droppedFeatures": ["feature"], + "droppedArgumentNames": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -451,7 +451,7 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "robust_scaler", - "droppedFeatures": ["feature"], + "droppedArgumentNames": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) diff --git a/python/tests/fixtures/feature_group_fixtures.json b/python/tests/fixtures/feature_group_fixtures.json index c2394ed4cb..bc967508b0 100644 --- a/python/tests/fixtures/feature_group_fixtures.json +++ b/python/tests/fixtures/feature_group_fixtures.json @@ -695,7 +695,7 @@ "name": "add_two", "outputTypes":["double"], "transformationFeatures":["data"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, { @@ -707,7 +707,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ], diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index 1ad25dea36..260cffd0c9 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -935,7 +935,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, { @@ -947,7 +947,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ], diff --git a/python/tests/fixtures/training_dataset_feature_fixtures.json b/python/tests/fixtures/training_dataset_feature_fixtures.json index 27cd07f302..0ca85653c8 100644 --- a/python/tests/fixtures/training_dataset_feature_fixtures.json +++ 
b/python/tests/fixtures/training_dataset_feature_fixtures.json @@ -79,7 +79,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, "featuregroup": { diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 036eb2fac7..2604d5d75e 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -9,7 +9,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } }, @@ -24,7 +24,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } }, @@ -39,7 +39,7 @@ "outputTypes":["string"], "transformationFeatures":["feature1", "feature2", "feature3"], "statisticsArgumentNames":["data1", "data2"], - "dropped_features":["data1", "data2", "data3"] + "dropped_argument_names":["data1", "data2", "data3"] } } }, @@ -54,7 +54,7 @@ "outputTypes":["string", "double"], "transformationFeatures":["feature1", "feature2", "feature3"], "statisticsArgumentNames":["data1", "data2"], - "dropped_features":["data1", "data2", "data3"] + "dropped_argument_names":["data1", "data2", "data3"] } } }, @@ -72,7 +72,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, { @@ -84,7 +84,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ] @@ -104,7 +104,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ] From 4681f3302f064223dd05304ee37d80fb6b5269dc Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 9 Jul 2024 06:56:36 +0200 Subject: [PATCH 58/58] fixing unit tests --- python/tests/test_hopswork_udf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index dfef840067..06ffb19742 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -746,6 +746,7 @@ def test_func(col1): with pytest.raises(FeatureStoreException) as exe: test_func._validate_udf_type() + test_func.get_udf() assert str(exe.value) == "UDF Type cannot be None"
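
Taken together, the hunks above split the old `dropped_features` field into `dropped_argument_names` (the names as written in the UDF signature) and `dropped_feature_names` (the columns those arguments resolve to once the UDF is bound). A minimal usage sketch of that remapping follows; it assumes the decorator factory is exported as `udf(return_type, drop=None)`, and the decorator name and feature names below are illustrative rather than taken from these patches:

    import pandas as pd

    from hsfs.hopsworks_udf import udf  # assumed export name for the decorator factory


    @udf(float, drop=["feature"])
    def plus_one(feature: pd.Series) -> pd.Series:
        # "feature" is an *argument* name in the signature; listing it in
        # `drop` removes whatever column is bound to it from the output.
        return feature + 1


    # Binding the UDF to a concrete column remaps the dropped argument name
    # ("feature") to the dropped feature name ("col1"), following the
    # index-based lookup in `__call__` shown above.
    bound = plus_one("col1")
    print(bound.dropped_features)  # expected: ["col1"]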
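
Similarly, the guard moved from `_validate_udf_type` into `get_udf` means a UDF must carry a type before it can be materialized, which is what the added test line exercises. A sketch of that failure mode and the usual remedy, assuming `UDFType` and the `udf` decorator are both importable from `hsfs.hopsworks_udf` and that a Python engine session is active:

    import pandas as pd

    from hsfs.client.exceptions import FeatureStoreException
    from hsfs.hopsworks_udf import UDFType, udf  # UDFType location is assumed


    @udf(float)
    def add_one(data1: pd.Series) -> pd.Series:
        return data1 + 1


    try:
        add_one.get_udf()  # udf_type is still None at this point
    except FeatureStoreException as err:
        print(err)  # "UDF Type cannot be None"

    # The type is normally assigned when the UDF is attached to a feature
    # group (ON_DEMAND) or a feature view (MODEL_DEPENDENT); setting it by
    # hand lets get_udf() return the callable.
    add_one.udf_type = UDFType.MODEL_DEPENDENT
    transform = add_one.get_udf()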