diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py
new file mode 100644
index 0000000000..ae24cd4274
--- /dev/null
+++ b/python/hsfs/builtin_transformations.py
@@ -0,0 +1,66 @@
+#
+# Copyright 2024 Hopsworks AB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pandas as pd
+from hsfs.hopsworks_udf import udf
+from hsfs.transformation_statistics import TransformationStatistics
+
+
+feature_statistics = TransformationStatistics("feature")
+
+
+@udf(float, drop=["feature"])
+def min_max_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
+    return (feature - statistics.feature.min) / (
+        statistics.feature.max - statistics.feature.min
+    )
+
+
+@udf(float, drop=["feature"])
+def standard_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
+    return (feature - statistics.feature.mean) / statistics.feature.stddev
+
+
+@udf(float, drop=["feature"])
+def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
+    return (feature - statistics.feature.percentiles[49]) / (
+        statistics.feature.percentiles[74] - statistics.feature.percentiles[24]
+    )
+
+
+@udf(int, drop=["feature"])
+def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
+    unique_data = sorted(
+        [value for value in statistics.feature.extended_statistics["unique_values"]]
+    )
+    value_to_index = {value: index for index, value in enumerate(unique_data)}
+    return pd.Series(
+        [value_to_index[data] if not pd.isna(data) else np.nan for data in feature]
+    )
+
+
+@udf(bool, drop=["feature"])
+def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
+    unique_data = [
+        value for value in statistics.feature.extended_statistics["unique_values"]
+    ]
+    one_hot = pd.get_dummies(feature, dtype="bool")
+    for data in unique_data:
+        if data not in one_hot:
+            one_hot[data] = False
+    # Sorting by columns so as to maintain consistency in column order.
+    return one_hot.reindex(sorted(one_hot.columns), axis=1)
diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py
index e305e8ca5a..5e527b6f13 100644
--- a/python/hsfs/constructor/query.py
+++ b/python/hsfs/constructor/query.py
@@ -59,7 +59,7 @@ def __init__(
             fg_mod.ExternalFeatureGroup,
             fg_mod.SpineGroup,
         ],
-        left_features: List[Union[str, "Feature"]],
+        left_features: List[Union[str, "Feature", Dict]],
        feature_store_name: Optional[str] = None,
        feature_store_id: Optional[int] = None,
        left_feature_group_start_time: Optional[Union[str, int, date, datetime]] = None,
diff --git a/python/hsfs/core/builtin_transformation_function.py b/python/hsfs/core/builtin_transformation_function.py
deleted file mode 100644
index 7ef5b63555..0000000000
--- a/python/hsfs/core/builtin_transformation_function.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#
-# Copyright 2021 Logical Clocks AB
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
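The new builtin_transformations.py above implements each built-in as a statistics-aware `@udf`. A user-defined transformation can follow the same pattern; the sketch below is illustrative only (the `amount` column and the `amount_min_max` name are hypothetical) and assumes `udf` and `TransformationStatistics` are importable exactly as in that file.

```python
import pandas as pd

from hsfs.hopsworks_udf import udf
from hsfs.transformation_statistics import TransformationStatistics

# Request descriptive statistics for the input column "amount" (hypothetical feature name).
amount_statistics = TransformationStatistics("amount")


@udf(float, drop=["amount"])
def amount_min_max(amount: pd.Series, statistics=amount_statistics) -> pd.Series:
    # The feature store injects training-dataset statistics through `statistics`,
    # exactly as it does for the built-in min_max_scaler above.
    return (amount - statistics.amount.min) / (
        statistics.amount.max - statistics.amount.min
    )
```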
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from typing import List - -from hsfs.client.exceptions import FeatureStoreException -from hsfs.core import feature_descriptive_statistics as fds - - -class BuiltInTransformationFunction: - def __init__(self, method): - self._method = method.lower() - - @staticmethod - def min_max_scaler_stats( - feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics], - feature_name: str, - ): - min_value = None - max_value = None - for stats in feature_descriptive_stats: - if stats.feature_name == feature_name: - if stats.feature_type not in ["Integral", "Fractional", "Decimal"]: - raise ValueError("Can't compute min_max_scaler for this type") - min_value = stats.min - max_value = stats.max - - if min_value is None or max_value is None: - raise FeatureStoreException( - "Feature {feature_name:} doesn't have minimum and/or maximum values computed. Thus can't use " - "min_max_scaler method".format(feature_name=feature_name) - ) - return min_value, max_value - - @staticmethod - def standard_scaler_stats( - feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics], - feature_name: str, - ): - mean = None - std_dev = None - for stats in feature_descriptive_stats: - if stats.feature_name == feature_name: - if stats.feature_type not in ["Integral", "Fractional", "Decimal"]: - raise ValueError("Can't compute standard_scaler for this type") - mean = stats.mean - std_dev = stats.stddev - - if mean is None or std_dev is None: - raise FeatureStoreException( - "Feature {feature_name:} doesn't have mean and/or standard deviation computed. Thus can't use " - "standard_scaler method".format(feature_name=feature_name) - ) - return mean, std_dev - - @staticmethod - def robust_scaler_stats( - feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics], - feature_name: str, - ): - percentiles = None - for stats in feature_descriptive_stats: - if stats.feature_name == feature_name: - if stats.feature_type not in ["Integral", "Fractional", "Decimal"]: - raise ValueError("Can't compute robust_scaler for this type") - if stats.percentiles is not None and len(stats.percentiles) > 0: - percentiles = stats.percentiles - - if percentiles is None: - raise FeatureStoreException( - "Feature {feature_name:} doesn't have mean and/or standard deviation computed. 
Thus can't use " - "standard_scaler method".format(feature_name=feature_name) - ) - return percentiles - - @staticmethod - def encoder_stats( - feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics], - feature_name: str, - ): - for stats in feature_descriptive_stats: - if ( - stats.feature_name == feature_name - and stats.extended_statistics is not None - and "unique_values" in stats.extended_statistics - ): - unique_data = [ - value for value in stats.extended_statistics["unique_values"] - ] - value_to_index = dict( - (value, index) for index, value in enumerate(unique_data) - ) - return value_to_index diff --git a/python/hsfs/core/feature_group_api.py b/python/hsfs/core/feature_group_api.py index 11fdbbbdc6..c6b0a1a70f 100644 --- a/python/hsfs/core/feature_group_api.py +++ b/python/hsfs/core/feature_group_api.py @@ -51,6 +51,9 @@ def save( feature_group_instance.feature_store_id, "featuregroups", ] + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } headers = {"content-type": "application/json"} feature_group_object = feature_group_instance.update_from_response_json( _client._send_request( @@ -58,6 +61,7 @@ def save( path_params, headers=headers, data=feature_group_instance.json(), + query_params=query_params, ), ) return feature_group_object @@ -93,7 +97,11 @@ def get( "featuregroups", name, ] - query_params = None if version is None else {"version": version} + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } + if version is not None: + query_params["version"] = version fg_objs = [] # In principle unique names are enforced across fg type and this should therefore @@ -157,8 +165,10 @@ def get_by_id( "featuregroups", feature_group_id, ] - - fg_json = _client._send_request("GET", path_params) + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } + fg_json = _client._send_request("GET", path_params, query_params) if ( fg_json["type"] == FeatureGroupApi.BACKEND_FG_STREAM or fg_json["type"] == FeatureGroupApi.BACKEND_FG_BATCH diff --git a/python/hsfs/core/feature_group_engine.py b/python/hsfs/core/feature_group_engine.py index 3e88805eda..010810f6cc 100644 --- a/python/hsfs/core/feature_group_engine.py +++ b/python/hsfs/core/feature_group_engine.py @@ -88,7 +88,9 @@ def insert( validation_options: dict = None, ): dataframe_features = engine.get_instance().parse_schema_feature_group( - feature_dataframe, feature_group.time_travel_format + feature_dataframe, + feature_group.time_travel_format, + feature_group.transformation_functions, ) util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features @@ -281,7 +283,9 @@ def insert_stream( ) dataframe_features = engine.get_instance().parse_schema_feature_group( - dataframe, feature_group.time_travel_format + dataframe, + feature_group.time_travel_format, + feature_group.transformation_functions, ) util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py index ed5a8468c3..50355f3d5f 100644 --- a/python/hsfs/core/feature_view_api.py +++ b/python/hsfs/core/feature_view_api.py @@ -17,12 +17,7 @@ from typing import List, Optional, Union -from hsfs import ( - client, - feature_view, - training_dataset, - transformation_function_attached, -) +from hsfs import client, feature_view, training_dataset from hsfs.client.exceptions import RestAPIError from hsfs.constructor import 
query, serving_prepared_statement from hsfs.core import explicit_provenance, job, training_dataset_job_conf @@ -78,13 +73,28 @@ def update(self, feature_view_obj: feature_view.FeatureView) -> None: data=feature_view_obj.json(), ) - def get_by_name(self, name: str) -> feature_view.FeatureView: + def get_by_name(self, name: str) -> List[feature_view.FeatureView]: + """ + Get a feature view from the backend using its name. + + # Arguments + name `str`: Name of the feature view. + + # Returns + `List[FeatureView]`: A list that contains all version of the feature view. + + # Raises + `RestAPIError`: If the feature view cannot be found from the backend. + `ValueError`: If the feature group associated with the feature view cannot be found. + """ path = self._base_path + [name] try: return [ feature_view.FeatureView.from_response_json(fv) for fv in self._client._send_request( - self._GET, path, {"expand": ["query", "features"]} + self._GET, + path, + {"expand": ["query", "features", "transformationfunctions"]}, )["items"] ] except RestAPIError as e: @@ -98,11 +108,27 @@ def get_by_name(self, name: str) -> feature_view.FeatureView: raise e def get_by_name_version(self, name: str, version: int) -> feature_view.FeatureView: + """ + Get a feature view form the backend using both name and version + + # Arguments + name `str`: Name of feature view. + version `version`: Version of the feature view. + + # Returns + `FeatureView` + + # Raises + `RestAPIError`: If the feature view cannot be found from the backend. + `ValueError`: If the feature group associated with the feature view cannot be found. + """ path = self._base_path + [name, self._VERSION, version] try: return feature_view.FeatureView.from_response_json( self._client._send_request( - self._GET, path, {"expand": ["query", "features"]} + self._GET, + path, + {"expand": ["query", "features", "transformationfunctions"]}, ) ) except RestAPIError as e: @@ -180,17 +206,6 @@ def get_serving_prepared_statement( self._client._send_request("GET", path, query_params, headers=headers) ) - def get_attached_transformation_fn( - self, name: str, version: int - ) -> Union[ - "transformation_function_attached.TransformationFunctionAttached", - List["transformation_function_attached.TransformationFunctionAttached"], - ]: - path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION] - return transformation_function_attached.TransformationFunctionAttached.from_response_json( - self._client._send_request("GET", path) - ) - def create_training_dataset( self, name: str, diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index dd49fa5e21..f85529163f 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -17,7 +17,7 @@ import datetime import warnings -from typing import Optional +from typing import List, Optional, Union from hsfs import ( client, @@ -37,7 +37,6 @@ statistics_engine, tags_api, training_dataset_engine, - transformation_function_engine, ) from hsfs.training_dataset_split import TrainingDatasetSplit @@ -53,11 +52,6 @@ def __init__(self, feature_store_id): self._feature_view_api = feature_view_api.FeatureViewApi(feature_store_id) self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - ) self._td_code_engine = code_engine.CodeEngine( feature_store_id, self._TRAINING_DATA_API_PATH ) @@ -69,7 +63,18 @@ def 
__init__(self, feature_store_id): ) self._query_constructor_api = query_constructor_api.QueryConstructorApi() - def save(self, feature_view_obj): + def save( + self, feature_view_obj: feature_view.FeatureView + ) -> feature_view.FeatureView: + """ + Save a feature view to the backend. + + # Arguments + feature_view_obj `FeatureView` : The feature view object to be saved. + + # Returns + `FeatureView` : Updated feature view that has the ID used to save in the backend. + """ if feature_view_obj.query.is_time_travel(): warnings.warn( "`as_of` argument in the `Query` will be ignored because" @@ -120,41 +125,53 @@ def save(self, feature_view_obj): ) ) - self._transformation_function_engine.attach_transformation_fn(feature_view_obj) updated_fv = self._feature_view_api.post(feature_view_obj) - self.attach_transformation_function(updated_fv) print( "Feature view created successfully, explore it at \n" + self._get_feature_view_url(updated_fv) ) return updated_fv - def update(self, feature_view_obj): + def update( + self, feature_view_obj: feature_view.FeatureView + ) -> feature_view.FeatureView: + """ + Update the feature view object saved in the backend + + # Arguments + feature_view_obj `FeatureView` : The feature view object to be saved. + + # Returns + `FeatureView` : Updated feature view that has the ID used to save in the backend. + """ self._feature_view_api.update(feature_view_obj) return feature_view_obj - def get(self, name, version=None): + def get( + self, name: str, version: int = None + ) -> Union[feature_view.FeatureView, List[feature_view.FeatureView]]: + """ + Get a feature view form the backend using name or using name and version. + + If version is not provided then a List of feature views containing all of its versions is returned. + + # Arguments + name `str`: Name of feature view. + version `version`: Version of the feature view. + + # Returns + `Union[FeatureView, List[FeatureView]]` + + # Raises + `RestAPIError`: If the feature view cannot be found from the backend. + `ValueError`: If the feature group associated with the feature view cannot be found. 
+ """ if version: fv = self._feature_view_api.get_by_name_version(name, version) - self.attach_transformation_function(fv) else: fv = self._feature_view_api.get_by_name(name) - for _fv in fv: - self.attach_transformation_function(_fv) return fv - def attach_transformation_function(self, fv: "feature_view.FeatureView"): - fv.transformation_functions = ( - self._transformation_function_engine.get_fv_attached_transformation_fn( - fv.name, fv.version - ) - ) - if fv.transformation_functions: - for feature in fv.schema: - feature.transformation_function = fv.transformation_functions.get( - feature.name, None - ) - def delete(self, name, version=None): if version: return self._feature_view_api.delete_by_name_version(name, version) @@ -355,7 +372,12 @@ def get_training_data( spine=spine, ) split_df = engine.get_instance().get_training_data( - td_updated, feature_view_obj, query, read_options, dataframe_type + td_updated, + feature_view_obj, + query, + read_options, + dataframe_type, + training_dataset_version, ) self.compute_training_dataset_statistics( feature_view_obj, td_updated, split_df @@ -678,19 +700,17 @@ def _get_training_dataset_metadata( td = self._feature_view_api.get_training_dataset_by_version( feature_view_obj.name, feature_view_obj.version, training_dataset_version ) - # schema and transformation functions need to be set for writing training data or feature serving + # schema needs to be set for writing training data or feature serving td.schema = feature_view_obj.schema - td.transformation_functions = feature_view_obj.transformation_functions return td def _get_training_datasets_metadata(self, feature_view_obj): tds = self._feature_view_api.get_training_datasets( feature_view_obj.name, feature_view_obj.version ) - # schema and transformation functions need to be set for writing training data or feature serving + # schema needs to be set for writing training data or feature serving for td in tds: td.schema = feature_view_obj.schema - td.transformation_functions = feature_view_obj.transformation_functions return tds def get_training_datasets(self, feature_view_obj): diff --git a/python/hsfs/core/training_dataset_engine.py b/python/hsfs/core/training_dataset_engine.py index 8d47adf165..34907ce3ca 100644 --- a/python/hsfs/core/training_dataset_engine.py +++ b/python/hsfs/core/training_dataset_engine.py @@ -22,7 +22,6 @@ from hsfs.core import ( tags_api, training_dataset_api, - transformation_function_engine, ) @@ -38,11 +37,6 @@ def __init__(self, feature_store_id): feature_store_id ) self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - ) def save(self, training_dataset, features, user_write_options): if isinstance(features, query.Query): @@ -53,9 +47,6 @@ def save(self, training_dataset, features, user_write_options): ) for label_name in training_dataset.label ] - self._transformation_function_engine.attach_transformation_fn( - training_dataset - ) else: features = engine.get_instance().convert_to_default_dataframe(features) training_dataset._features = ( @@ -66,19 +57,11 @@ def save(self, training_dataset, features, user_write_options): if feature.name == label_name: feature.label = True - # check if user provided transformation functions and throw error as transformation functions work only - # with query objects - if training_dataset.transformation_functions: - raise ValueError( - "Transformation functions can only be applied to 
training datasets generated from Query object" - ) - if len(training_dataset.splits) > 0 and training_dataset.train_split is None: training_dataset.train_split = "train" warnings.warn( "Training dataset splits were defined but no `train_split` (the name of the split that is going to be " - "used for training) was provided. Setting this property to `train`. The statistics of this " - "split will be used for transformation functions.", + "used for training) was provided. Setting this property to `train`. ", stacklevel=1, ) diff --git a/python/hsfs/core/transformation_function_api.py b/python/hsfs/core/transformation_function_api.py index a0f21f0097..f6692f8f62 100644 --- a/python/hsfs/core/transformation_function_api.py +++ b/python/hsfs/core/transformation_function_api.py @@ -19,9 +19,7 @@ from hsfs import ( client, - training_dataset, transformation_function, - transformation_function_attached, ) @@ -112,27 +110,3 @@ def delete( ] headers = {"content-type": "application/json"} _client._send_request("DELETE", path_params, headers=headers) - - def get_td_transformation_fn( - self, training_dataset_instance: training_dataset.TrainingDataset - ) -> transformation_function_attached.TransformationFunctionAttached: - """ - Retrieve TransformationFunctionAttached instance - Args: - training_dataset_instance: TrainingDataset, required - training dataset metadata object. - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - "trainingdatasets", - training_dataset_instance.id, - "transformationfunctions", - ] - - return transformation_function_attached.TransformationFunctionAttached.from_response_json( - _client._send_request("GET", path_params) - ) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 4d1db1df04..4ab8c6a166 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -15,27 +15,12 @@ # from __future__ import annotations -import datetime -from functools import partial -from typing import Dict, Optional, Union - -import hsfs -import numpy -from hsfs import ( - feature_view, - statistics, - training_dataset, - training_dataset_feature, - transformation_function_attached, - util, -) -from hsfs.core import ( - feature_view_api, - statistics_api, - statistics_engine, - transformation_function_api, -) -from hsfs.core.builtin_transformation_function import BuiltInTransformationFunction +from typing import Dict, List, Optional, Set, TypeVar, Union + +import pandas as pd +import polars as pl +from hsfs import feature_view, statistics, training_dataset, transformation_function +from hsfs.core import transformation_function_api class TransformationFunctionEngine: @@ -53,36 +38,56 @@ class TransformationFunctionEngine: def __init__(self, feature_store_id: int): self._feature_store_id = feature_store_id - self._transformation_function_api = ( - transformation_function_api.TransformationFunctionApi(feature_store_id) - ) - self._statistics_api = statistics_api.StatisticsApi( - feature_store_id, training_dataset.TrainingDataset.ENTITY_TYPE + self._transformation_function_api: transformation_function_api.TransformationFunctionApi = transformation_function_api.TransformationFunctionApi( + feature_store_id ) - self._feature_view_api: Optional["feature_view_api.FeatureViewApi"] = None - self._statistics_engine: Optional["statistics_engine.StatisticsEngine"] = None - def save(self, 
transformation_fn_instance): - if self.is_builtin(transformation_fn_instance): - raise ValueError( - "Transformation function name '{name:}' with version 1 is reserved for built-in hsfs " - "functions. Please use other name or version".format( - name=transformation_fn_instance.name - ) - ) - if not callable(transformation_fn_instance.transformation_fn): - raise ValueError("transformer must be callable") + def save( + self, transformation_fn_instance: transformation_function.TransformationFunction + ) -> transformation_function.TransformationFunction: + """ + Save a transformation function into the feature store. + + # Argument + transformation_fn_instance `transformation_function.TransformationFunction`: The transformation function to be saved into the feature store. + """ self._transformation_function_api.register_transformation_fn( transformation_fn_instance ) - def get_transformation_fn(self, name, version=None): + def get_transformation_fn( + self, name: str, version: Optional[int] = None + ) -> Union[ + transformation_function.TransformationFunction, + List[transformation_function.TransformationFunction], + ]: + """ + Retrieve a transformation function from the feature store. + + If only the name of the transformation function is provided then all the versions of the transformation functions are returned as a list. + If both name and version are not provided then all transformation functions saved in the feature view is returned. + + # Argument + name ` Optional[str]`: The name of the transformation function to be retrieved. + version `Optional[int]`: The version of the transformation function to be retrieved. + # Returns + `Union[transformation_function.TransformationFunction, List[transformation_function.TransformationFunction]]` : A transformation function if name and version is provided. A list of transformation functions if only name is provided. + """ + transformation_fn_instances = ( self._transformation_function_api.get_transformation_fn(name, version) ) - return transformation_fn_instances[0] + return transformation_fn_instances + + def get_transformation_fns( + self, + ) -> List[transformation_function.TransformationFunction]: + """ + Get all the transformation functions in the feature store - def get_transformation_fns(self): + # Returns + `List[transformation_function.TransformationFunction]` : A list of transformation functions. + """ transformation_fn_instances = ( self._transformation_function_api.get_transformation_fn( name=None, version=None @@ -95,311 +100,202 @@ def get_transformation_fns(self): transformation_fns.append(transformation_fn_instance) return transformation_fns - def delete(self, transformation_function_instance): - self._transformation_function_api.delete(transformation_function_instance) - - def get_td_transformation_fn(self, training_dataset): - attached_transformation_fns = ( - self._transformation_function_api.get_td_transformation_fn(training_dataset) - ) - transformation_fn_dict = {} - for attached_transformation_fn in attached_transformation_fns: - transformation_fn_dict[attached_transformation_fn.name] = ( - attached_transformation_fn.transformation_function - ) - return transformation_fn_dict - - @staticmethod - def attach_transformation_fn(training_dataset_obj=None, feature_view_obj=None): - if training_dataset_obj: - target_obj = training_dataset_obj # todo why provide td and fv just to convert to target_obj? 
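Per the `save` and `get_transformation_fn` docstrings above, retrieving by name alone returns every version of a function, while name plus version returns a single object. A minimal sketch of that round trip, assuming `my_udf_tf` is an existing `TransformationFunction` and the feature store id is already known (both are illustrative here):

```python
from hsfs.core.transformation_function_engine import TransformationFunctionEngine

tf_engine = TransformationFunctionEngine(feature_store_id=67)  # id is illustrative

# Register a TransformationFunction built from a @udf-decorated function.
tf_engine.save(my_udf_tf)

# All versions of a function by name, or one specific version.
all_versions = tf_engine.get_transformation_fn("amount_min_max")
version_one = tf_engine.get_transformation_fn("amount_min_max", version=1)

# Every transformation function registered in this feature store.
everything = tf_engine.get_transformation_fns()
```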
- else: - target_obj = feature_view_obj - - if target_obj._transformation_functions: - for ( - feature_name, - transformation_fn, - ) in target_obj._transformation_functions.items(): - if feature_name in target_obj.labels: - raise ValueError( - "Online transformations for training dataset labels are not supported." - ) - - feature, prefix, featuregroup = target_obj.query._get_feature_by_name( - feature_name - ) - target_obj._features.append( - training_dataset_feature.TrainingDatasetFeature( - name=feature_name, - feature_group_feature_name=feature.name, - featuregroup=featuregroup, - type=transformation_fn.output_type, - label=False, - transformation_function=transformation_fn, - ) - ) - - def is_builtin(self, transformation_fn_instance): - return ( - transformation_fn_instance.name in self.BUILTIN_FN_NAMES - and transformation_fn_instance.version == 1 - ) - - @staticmethod - def populate_builtin_fn_arguments( - feature_name, transformation_function_instance, feature_descriptive_stats - ): - if transformation_function_instance.name == "min_max_scaler": - min_value, max_value = BuiltInTransformationFunction.min_max_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - min_value=min_value, - max_value=max_value, - ) - elif transformation_function_instance.name == "standard_scaler": - mean, std_dev = BuiltInTransformationFunction.standard_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - mean=mean, - std_dev=std_dev, - ) - elif transformation_function_instance.name == "robust_scaler": - robust_scaler_stats = BuiltInTransformationFunction.robust_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - p25=robust_scaler_stats[24], - p50=robust_scaler_stats[49], - p75=robust_scaler_stats[74], - ) - elif transformation_function_instance.name == "label_encoder": - value_to_index = BuiltInTransformationFunction.encoder_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - value_to_index=value_to_index, - ) - else: - raise ValueError("Not implemented") - - return transformation_function_instance - - def populate_builtin_attached_fns( - self, attached_transformation_fns, feature_descriptive_stats - ): - for ft_name in attached_transformation_fns: - if self.is_builtin(attached_transformation_fns[ft_name]): - # check if its built-in transformation function and populated with statistics arguments - transformation_fn = self.populate_builtin_fn_arguments( - ft_name, - attached_transformation_fns[ft_name], - feature_descriptive_stats, - ) - attached_transformation_fns[ft_name] = transformation_fn - return attached_transformation_fns + def delete( + self, + transformation_function_instance: transformation_function.TransformationFunction, + ) -> None: + """ + Delete a transformation function from the feature store. 
- @staticmethod - def infer_spark_type(output_type): - if not output_type: - return "STRING" # STRING is default type for spark udfs - - if isinstance(output_type, str): - if output_type.endswith("Type()"): - return util.translate_legacy_spark_type(output_type) - output_type = output_type.lower() - - if output_type in (str, "str", "string"): - return "STRING" - elif output_type in (bytes, "binary"): - return "BINARY" - elif output_type in (numpy.int8, "int8", "byte", "tinyint"): - return "BYTE" - elif output_type in (numpy.int16, "int16", "short", "smallint"): - return "SHORT" - elif output_type in (int, "int", "integer", numpy.int32): - return "INT" - elif output_type in (numpy.int64, "int64", "long", "bigint"): - return "LONG" - elif output_type in (float, "float"): - return "FLOAT" - elif output_type in (numpy.float64, "float64", "double"): - return "DOUBLE" - elif output_type in ( - datetime.datetime, - numpy.datetime64, - "datetime", - "timestamp", - ): - return "TIMESTAMP" - elif output_type in (datetime.date, "date"): - return "DATE" - elif output_type in (bool, "boolean", "bool"): - return "BOOLEAN" - else: - raise TypeError("Not supported type %s." % output_type) + # Arguments + transformation_function_instance `transformation_function.TransformationFunction`: The transformation function to be removed from the feature store. + """ + self._transformation_function_api.delete(transformation_function_instance) @staticmethod def compute_transformation_fn_statistics( - training_dataset_obj, - builtin_tffn_features, - label_encoder_features, - feature_dataframe, - feature_view_obj, + training_dataset_obj: training_dataset.TrainingDataset, + statistics_features: List[str], + label_encoder_features: List[str], + feature_dataframe: Union[ + pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame") + ], + feature_view_obj: feature_view.FeatureView, ) -> statistics.Statistics: + """ + Compute the statistics required for a training dataset object. + + # Arguments + training_dataset_obj `TrainingDataset`: The training dataset for which the statistics is to be computed. + statistics_features `List[str]`: The list of features for which the statistics should be computed. + label_encoder_features `List[str]`: Features used for label encoding. + feature_dataframe `Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]`: The dataframe that contains the data for which the statistics must be computed. + feature_view_obj `FeatureView`: The feature view in which the training data is being created. + # Returns + `Statistics` : The statistics object that contains the statistics for each features. 
+ """ return training_dataset_obj._statistics_engine.compute_transformation_fn_statistics( td_metadata_instance=training_dataset_obj, - columns=builtin_tffn_features, # excluding label encoded features + columns=statistics_features, label_encoder_features=label_encoder_features, # label encoded features only feature_dataframe=feature_dataframe, feature_view_obj=feature_view_obj, ) @staticmethod - def populate_builtin_transformation_functions( - training_dataset, feature_view_obj, dataset - ): - # check if there any transformation functions that require statistics attached to td features - builtin_tffn_label_encoder_features = [ - ft_name - for ft_name in training_dataset.transformation_functions - if training_dataset._transformation_function_engine.is_builtin( - training_dataset.transformation_functions[ft_name] + def get_ready_to_use_transformation_fns( + feature_view: feature_view.FeatureView, + training_dataset_version: Optional[int] = None, + ) -> List[transformation_function.TransformationFunction]: + """ + Function that updates statistics required for all transformation functions in the feature view based on training dataset version. + + # Arguments + feature_view `FeatureView`: The feature view in which the training data is being created. + training_dataset_version `TrainingDataset`: The training version used to update the statistics used in the transformation functions. + # Returns + `List[transformation_function.TransformationFunction]` : List of transformation functions. + """ + # check if transformation functions require statistics + is_stat_required = any( + [ + tf.hopsworks_udf.statistics_required + for tf in feature_view.transformation_functions + ] + ) + if not is_stat_required: + td_tffn_stats = None + else: + # if there are any transformation functions that require statistics get related statistics and + # populate with relevant arguments + # there should be only one statistics object with before_transformation=true + if training_dataset_version is None: + raise ValueError( + "Training data version is required for transformation. Call `feature_view.init_serving(version)` " + "or `feature_view.init_batch_scoring(version)` to pass the training dataset version." + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." + ) + td_tffn_stats = feature_view._statistics_engine.get( + feature_view, + before_transformation=True, + training_dataset_version=training_dataset_version, ) - and training_dataset.transformation_functions[ft_name].name - == "label_encoder" - ] - builtin_tffn_features = [ - ft_name - for ft_name in training_dataset.transformation_functions - if training_dataset._transformation_function_engine.is_builtin( - training_dataset.transformation_functions[ft_name] + + if is_stat_required and td_tffn_stats is None: + raise ValueError( + "No statistics available for initializing transformation functions." + + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." 
) - and training_dataset.transformation_functions[ft_name].name - != "label_encoder" - ] - if builtin_tffn_features or builtin_tffn_label_encoder_features: + if is_stat_required: + for transformation_function in feature_view.transformation_functions: + transformation_function.hopsworks_udf.transformation_statistics = ( + td_tffn_stats.feature_descriptive_statistics + ) + return feature_view.transformation_functions + + @staticmethod + def compute_and_set_feature_statistics( + training_dataset: training_dataset.TrainingDataset, + feature_view_obj: feature_view.FeatureView, + dataset: Union[ + Dict[ + str, Union[pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame")] + ], + Union[pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame")], + ], + ) -> None: + """ + Function that computes and sets the statistics required for the UDF used for transformation. + + The function assigns the statistics computed to hopsworks UDF object so that the statistics can be used when UDF is executed. + + # Argument + training_dataset_obj `TrainingDataset`: The training dataset for which the statistics is to be computed. + feature_view `FeatureView`: The feature view in which the training data is being created. + dataset `Union[Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]], Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]]`: A dataframe that conqtains the training data or a dictionary that contains both the training and test data. + """ + statistics_features: Set[str] = set() + label_encoder_features: Set[str] = set() + + # Finding the features for which statistics is required + for tf in feature_view_obj.transformation_functions: + statistics_features.update(tf.hopsworks_udf.statistics_features) + if ( + tf.hopsworks_udf.function_name == "label_encoder" + or tf.hopsworks_udf.function_name == "one_hot_encoder" + ): + label_encoder_features.update(tf.hopsworks_udf.statistics_features) + if statistics_features: + # compute statistics on training data if training_dataset.splits: # compute statistics before transformations are applied stats = ( TransformationFunctionEngine.compute_transformation_fn_statistics( training_dataset, - builtin_tffn_features, - builtin_tffn_label_encoder_features, + list(statistics_features), + list(label_encoder_features), dataset.get(training_dataset.train_split), feature_view_obj, ) ) else: - # compute statistics before transformations are applied stats = ( TransformationFunctionEngine.compute_transformation_fn_statistics( training_dataset, - builtin_tffn_features, - builtin_tffn_label_encoder_features, + list(statistics_features), + list(label_encoder_features), dataset, feature_view_obj, ) ) - # Populate builtin transformations (if any) with respective arguments - return training_dataset._transformation_function_engine.populate_builtin_attached_fns( - training_dataset.transformation_functions, - stats.feature_descriptive_statistics, - ) - def get_ready_to_use_transformation_fns( - self, - entity: Union[hsfs.feature_view.FeatureView, training_dataset.TrainingDataset], - training_dataset_version: Optional[int] = None, - ) -> Dict[ - str, hsfs.transformation_function_attached.TransformationFunctionAttached - ]: - is_feat_view = isinstance(entity, feature_view.FeatureView) - if self._feature_view_api is None: - self._feature_view_api = feature_view_api.FeatureViewApi( - self._feature_store_id - ) - if self._statistics_engine is None: - self._statistics_engine = statistics_engine.StatisticsEngine( - self._feature_store_id, - entity_type="featureview" if is_feat_view 
else "trainingdataset", - ) - # get attached transformation functions - transformation_functions = ( - self.get_td_transformation_fn(entity) - if isinstance(entity, training_dataset.TrainingDataset) - else (self.get_fv_attached_transformation_fn(entity.name, entity.version)) - ) - is_stat_required = ( - len( - set(self.BUILTIN_FN_NAMES).intersection( - set([tf.name for tf in transformation_functions.values()]) + # Set statistics computed in the hopsworks UDF + for tf in feature_view_obj.transformation_functions: + tf.hopsworks_udf.transformation_statistics = ( + stats.feature_descriptive_statistics ) - ) - > 0 + + @staticmethod + def get_and_set_feature_statistics( + training_dataset: training_dataset.TrainingDataset, + feature_view_obj: feature_view.FeatureView, + training_dataset_version: int = None, + ) -> None: + """ + Function that gets the transformation statistics computed while creating the training dataset from the backend and assigns it to the hopsworks UDF object. + + The function assigns the statistics computed to hopsworks UDF object so that the statistics can be used when UDF is executed. + + # Argument + training_dataset_obj `TrainingDataset`: The training dataset for which the statistics is to be computed. + feature_view `FeatureView`: The feature view in which the training data is being created. + training_dataset_version `int`: The version of the training dataset for which the statistics is to be retrieved. + + # Raises + `ValueError` : If the statistics are not present in the backend. + """ + + is_stat_required = any( + [ + tf.hopsworks_udf.statistics_required + for tf in feature_view_obj.transformation_functions + ] ) - if not is_stat_required: - td_tffn_stats = None - else: - # if there are any built-in transformation functions get related statistics and - # populate with relevant arguments - # there should be only one statistics object with before_transformation=true - if is_feat_view and training_dataset_version is None: - raise ValueError( - "Training data version is required for transformation. Call `feature_view.init_serving(version)` " - "or `feature_view.init_batch_scoring(version)` to pass the training dataset version." - "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." - ) - td_tffn_stats = self._statistics_engine.get( - entity, + + if is_stat_required: + td_tffn_stats = training_dataset._statistics_engine.get( + feature_view_obj, before_transformation=True, training_dataset_version=training_dataset_version, ) - if is_stat_required and td_tffn_stats is None: - raise ValueError( - "No statistics available for initializing transformation functions." - + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." - ) - - transformation_fns = self.populate_builtin_attached_fns( - transformation_functions, - td_tffn_stats.feature_descriptive_statistics - if td_tffn_stats is not None - else None, - ) - return transformation_fns + if td_tffn_stats is None: + raise ValueError( + "No statistics available for initializing transformation functions." 
+ ) - def get_fv_attached_transformation_fn( - self, fv_name: str, fv_version: int - ) -> Dict[str, "transformation_function_attached.TransformationFunctionAttached"]: - if self._feature_view_api is None: - self._feature_view_api = feature_view_api.FeatureViewApi( - self._feature_store_id - ) - self._statistics_engine = statistics_engine.StatisticsEngine( - self._feature_store_id, - entity_type="featureview", - ) - transformation_functions = ( - self._feature_view_api.get_attached_transformation_fn(fv_name, fv_version) - ) - if isinstance(transformation_functions, list): - transformation_functions_dict = dict( - [ - (tf.name, tf.transformation_function) - for tf in transformation_functions - ] - ) - else: - transformation_functions_dict = { - transformation_functions.name: transformation_functions.transformation_function - } - return transformation_functions_dict + for tf in feature_view_obj.transformation_functions: + tf.hopsworks_udf.transformation_statistics = ( + td_tffn_stats.feature_descriptive_statistics + ) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index ae35f326b8..403cbb2522 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -31,16 +31,12 @@ client, feature_view, training_dataset, + transformation_function, ) from hsfs import ( serving_key as sk_mod, ) -from hsfs import ( - training_dataset_feature as tdf_mod, -) -from hsfs import ( - transformation_function_attached as tfa_mod, -) +from hsfs import training_dataset_feature as tdf_mod from hsfs.client import exceptions, online_store_rest_client from hsfs.core import ( online_store_rest_client_engine, @@ -103,7 +99,7 @@ def __init__( self._inference_helper_col_name = [ feat.name for feat in features if feat.inference_helper_column ] - + self._transformed_feature_vector_col_name: List[str] = None self._skip_fg_ids = skip_fg_ids or set() self._serving_keys = serving_keys or [] self._required_serving_keys = [] @@ -111,9 +107,12 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: Dict[ - str, tfa_mod.TransformationFunctionAttached - ] = {} + self._model_dependent_transformation_functions: List[ + transformation_function.TransformationFunction + ] = [] + self._on_demand_transformation_functions: List[ + transformation_function.TransformationFunction + ] = [] self._sql_client = None self._rest_client_engine = None @@ -125,7 +124,7 @@ def __init__( def init_serving( self, - entity: Union[feature_view.FeatureView, training_dataset.TrainingDataset], + entity: Union[feature_view.FeatureView], external: Optional[bool] = None, inference_helper_columns: bool = False, options: Optional[Dict[str, Any]] = None, @@ -187,15 +186,23 @@ def init_batch_scoring( def init_transformation( self, - entity: Union[feature_view.FeatureView, training_dataset.TrainingDataset], + entity: feature_view.FeatureView, ): # attach transformation functions - self._transformation_functions = ( - self.transformation_function_engine.get_ready_to_use_transformation_fns( - entity, - self._training_dataset_version, - ) + self._model_dependent_transformation_functions = tf_engine_mod.TransformationFunctionEngine.get_ready_to_use_transformation_fns( + entity, + self._training_dataset_version, ) + self._on_demand_transformation_functions = [ + feature.on_demand_transformation_function + for feature in entity.features + if feature.on_demand_transformation_function + ] + self._on_demand_feature_names 
= [ + feature.name + for feature in entity.features + if feature.on_demand_transformation_function + ] def setup_sql_client( self, @@ -248,6 +255,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[Any], Dict[str, Any]]: """Assembles serving vector from online feature store.""" online_client_choice = self.which_client_and_ensure_initialised( @@ -279,8 +287,8 @@ def get_feature_vector( vector_db_result=vector_db_features or {}, allow_missing=allow_missing, client=online_client_choice, + request_parameters=request_parameters, ) - return self.handle_feature_vector_return_type( vector, batch=False, inference_helper=False, return_type=return_type ) @@ -293,6 +301,7 @@ def get_feature_vectors( ] = None, passed_features: Optional[List[Dict[str, Any]]] = None, vector_db_features: Optional[List[Dict[str, Any]]] = None, + request_parameters: Optional[List[Dict[str, Any]]] = None, allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, @@ -300,7 +309,6 @@ def get_feature_vectors( """Assembles serving vector from online feature store.""" if passed_features is None: passed_features = [] - # Assertions on passed_features and vector_db_features assert ( passed_features is None @@ -312,6 +320,12 @@ def get_feature_vectors( or len(vector_db_features) == 0 or len(vector_db_features) == len(entries) ), "Vector DB features should be None, empty or have the same length as the entries" + assert ( + request_parameters is None + or len(request_parameters) == 0 + or isinstance(request_parameters, dict) + or len(request_parameters) == len(entries) + ), "Request Parameters should be a Dictionary, None, empty or have the same length as the entries" online_client_choice = self.which_client_and_ensure_initialised( force_rest_client=force_rest_client, force_sql_client=force_sql_client @@ -354,14 +368,23 @@ def get_feature_vectors( skipped_empty_entries.pop(0) if len(skipped_empty_entries) > 0 else None ) vectors = [] + + # If request parameter is a dictionary then copy it to list with the same length as that of entires + request_parameters = ( + [request_parameters] * len(entries) + if isinstance(request_parameters, dict) + else request_parameters + ) for ( idx, passed_values, vector_db_result, + request_parameter, ) in itertools.zip_longest( range(len(entries)), passed_features or [], vector_db_features or [], + request_parameters or [], fillvalue=None, ): if next_skipped == idx: @@ -381,6 +404,7 @@ def get_feature_vectors( vector_db_result=vector_db_result, allow_missing=allow_missing, client=online_client_choice, + request_parameters=request_parameter, ) if vector is not None: @@ -397,6 +421,7 @@ def assemble_feature_vector( vector_db_result: Optional[Dict[str, Any]], allow_missing: bool, client: Literal["rest", "sql"], + request_parameters: Optional[Dict[str, Any]] = None, ) -> Optional[List[Any]]: """Assembles serving vector from online feature store.""" # Errors in batch requests are returned as None values @@ -411,9 +436,52 @@ def assemble_feature_vector( _logger.debug("Updating with passed features: %s", passed_values) result_dict.update(passed_values) - missing_features = set(self.feature_vector_col_name).difference( - result_dict.keys() + missing_features = ( + set(self.feature_vector_col_name) + .difference(result_dict.keys()) + .difference(self._on_demand_feature_names) ) + + # TODO : 
Optimize this + request_parameters = {} if not request_parameters else request_parameters + available_parameters = set((result_dict | request_parameters).keys()) + missing_request_parameters_features = {} + + for on_demand_feature, on_demand_transformation in zip( + self._on_demand_feature_names, self._on_demand_transformation_functions + ): + missing_request_parameter = ( + set(on_demand_transformation.hopsworks_udf.transformation_features) + - available_parameters + ) + if missing_request_parameter: + missing_request_parameters_features[on_demand_feature] = sorted( + list( + set( + on_demand_transformation.hopsworks_udf.transformation_features + ) + - available_parameters + ) + ) + + if missing_request_parameters_features: + error = "Missing Request parameters to compute the following the on-demand Features:\n" + for ( + feature, + missing_request_parameter, + ) in missing_request_parameters_features.items(): + missing_request_parameter = "', '".join(missing_request_parameter) + error += f"On-Demand Feature '{feature}' requires features '{missing_request_parameter}'\n" + error += ( + "Possible reasons: " + "1. There is no match in the given entry." + " Please check if the entry exists in the online feature store" + " or provide the feature as passed_feature. " + f"2. Required entries [{', '.join(self.required_serving_keys)}] or " + f"[{', '.join(set(sk.feature_name for sk in self._serving_keys))}] are not provided." + ) + raise exceptions.FeatureStoreException(error) + # for backward compatibility, before 3.4, if result is empty, # instead of throwing error, it skips the result # Maybe we drop this behaviour for 4.0 @@ -433,12 +501,18 @@ def assemble_feature_vector( if len(self.return_feature_value_handlers) > 0: self.apply_return_value_handlers(result_dict, client=client) - if len(self.transformation_functions) > 0: - self.apply_transformation(result_dict) + if ( + len(self.model_dependent_transformation_functions) > 0 + or len(self.on_demand_transformation_functions) > 0 + ): + self.apply_transformation(result_dict, request_parameters) _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) - return [result_dict.get(fname, None) for fname in self.feature_vector_col_name] + return [ + result_dict.get(fname, None) + for fname in self.transformed_feature_vector_col_name + ] def handle_feature_vector_return_type( self, @@ -477,17 +551,19 @@ def handle_feature_vector_return_type( return pd.DataFrame([feature_vectorz]) elif batch: return pd.DataFrame( - feature_vectorz, columns=self._feature_vector_col_name + feature_vectorz, columns=self.transformed_feature_vector_col_name ) else: pandas_df = pd.DataFrame(feature_vectorz).transpose() - pandas_df.columns = self._feature_vector_col_name + pandas_df.columns = self.transformed_feature_vector_col_name return pandas_df elif return_type.lower() == "polars": _logger.debug("Returning feature vector as polars dataframe") return pl.DataFrame( feature_vectorz if batch else [feature_vectorz], - schema=self._feature_vector_col_name if not inference_helper else None, + schema=self.transformed_feature_vector_col_name + if not inference_helper + else None, orient="row", ) else: @@ -634,15 +710,36 @@ def _set_default_client( self.default_client = self.DEFAULT_SQL_CLIENT self._init_sql_client = True - def apply_transformation(self, row_dict: Dict[str, Any]): - matching_keys = set(self.transformation_functions.keys()).intersection( - row_dict.keys() - ) - _logger.debug("Applying transformation functions to : %s", matching_keys) - for 
feature_name in matching_keys: - row_dict[feature_name] = self.transformation_functions[ - feature_name - ].transformation_fn(row_dict[feature_name]) + def apply_transformation(self, row_dict: dict, request_parameter: Dict[str, Any]): + _logger.debug("Applying On-Demand transformation functions.") + for tf in self._on_demand_transformation_functions: + # Check if feature provided as request parameter if not get it from retrieved feature vector. + features = [ + pd.Series(request_parameter[feature]) + if feature in request_parameter.keys() + else pd.Series(row_dict[feature]) + for feature in tf.hopsworks_udf.transformation_features + ] + on_demand_feature = tf.hopsworks_udf.get_udf(force_python_udf=True)( + *features + ) # Get only python compatible UDF irrespective of engine + + row_dict[on_demand_feature.name] = on_demand_feature.values[0] + + _logger.debug("Applying Model-Dependent transformation functions.") + for tf in self.model_dependent_transformation_functions: + features = [ + pd.Series(row_dict[feature]) + for feature in tf.hopsworks_udf.transformation_features + ] + transformed_result = tf.hopsworks_udf.get_udf(force_python_udf=True)( + *features + ) # Get only python compatible UDF irrespective of engine + if isinstance(transformed_result, pd.Series): + row_dict[transformed_result.name] = transformed_result.values[0] + else: + for col in transformed_result: + row_dict[col] = transformed_result[col].values[0] return row_dict def apply_return_value_handlers( @@ -678,6 +775,7 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]: for f in self._features if f.is_complex() } + if len(complex_feature_schemas) == 0: return {} else: @@ -869,7 +967,6 @@ def identify_missing_features_pre_fetch( passed_feature_names = passed_feature_names.union( vector_db_features.keys() ) - neither_fetched_nor_passed = fetched_features.difference( passed_feature_names ) @@ -993,12 +1090,16 @@ def per_serving_key_features(self) -> Dict[str, set[str]]: return self._per_serving_key_features @property - def transformation_functions( + def model_dependent_transformation_functions( + self, + ) -> Optional[List[transformation_function.TransformationFunction]]: + return self._model_dependent_transformation_functions + + @property + def on_demand_transformation_functions( self, - ) -> Dict[str, tfa_mod.TransformationFunctionAttached]: - if self._transformation_functions is None: - self._transformation_functions = {} - return self._transformation_functions + ) -> Optional[List[transformation_function.TransformationFunction]]: + return self._on_demand_transformation_functions @property def return_feature_value_handlers(self) -> Dict[str, Callable]: @@ -1064,3 +1165,26 @@ def default_client(self, default_client: Literal["rest", "sql"]): _logger.debug(f"Default Online Store Client is set to {default_client}.") self._default_client = default_client + + @property + def transformed_feature_vector_col_name(self): + if self._transformed_feature_vector_col_name is None: + transformation_features = [] + output_column_names = [] + for ( + transformation_function + ) in self._model_dependent_transformation_functions: + transformation_features += ( + transformation_function.hopsworks_udf.transformation_features + ) + output_column_names += ( + transformation_function.hopsworks_udf.output_column_names + ) + + self._transformed_feature_vector_col_name = [ + feature + for feature in self._feature_vector_col_name + if feature not in transformation_features + ] + 
self._transformed_feature_vector_col_name.extend(output_column_names) + return self._transformed_feature_vector_col_name diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 8e64e6ec95..b0efd7be0e 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -30,7 +30,16 @@ from datetime import datetime, timezone from io import BytesIO from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + List, + Literal, + Optional, + Tuple, + Union, +) import avro import boto3 @@ -48,7 +57,7 @@ feature, feature_store, feature_view, - transformation_function_attached, + transformation_function, util, ) from hsfs import storage_connector as sc @@ -93,6 +102,18 @@ except ImportError: pass +PYARROW_EXTENSION_ENABLE = False +try: + import pandas as pd + from packaging.version import Version + + if Version(pd.__version__) > Version("2.0"): + PYARROW_EXTENSION_ENABLE = True + else: + PYARROW_EXTENSION_ENABLE = False +except Exception: + PYARROW_EXTENSION_ENABLE = False # Set PYARROW_EXTENSION_ENABLE to false if pyarrow or pandas cannot be imported + # Decimal types are currently not supported _INT_TYPES = [pa.uint8(), pa.uint16(), pa.int8(), pa.int16(), pa.int32()] _BIG_INT_TYPES = [pa.uint32(), pa.int64()] @@ -206,7 +227,6 @@ def _sql_offline( hive_config: Optional[Dict[str, Any]] = None, arrow_flight_config: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame]: - self._validate_dataframe_type(dataframe_type) if isinstance(sql_query, dict) and "query_string" in sql_query: result_df = util.run_with_loading_animation( @@ -510,7 +530,12 @@ def show( sql_query, feature_store, online_conn, "default", read_options or {} ).head(n) - def read_vector_db(self, feature_group: "hsfs.feature_group.FeatureGroup", n: int =None, dataframe_type: str="default") -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: + def read_vector_db( + self, + feature_group: "hsfs.feature_group.FeatureGroup", + n: int = None, + dataframe_type: str = "default", + ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: dataframe_type = dataframe_type.lower() self._validate_dataframe_type(dataframe_type) @@ -779,6 +804,9 @@ def parse_schema_feature_group( self, dataframe: Union[pd.DataFrame, pl.DataFrame], time_travel_format: Optional[str] = None, + transformation_functions: Optional[ + List[transformation_function.TransformationFunction] + ] = None, ) -> List[feature.Feature]: if isinstance(dataframe, pd.DataFrame): arrow_schema = pa.Schema.from_pandas(dataframe, preserve_index=False) @@ -787,6 +815,20 @@ def parse_schema_feature_group( ): arrow_schema = dataframe.to_arrow().schema features = [] + transformed_features = [] + dropped_features = [] + + if transformation_functions: + for tf in transformation_functions: + transformed_features.append( + feature.Feature( + tf.hopsworks_udf.output_column_names[0], + tf.hopsworks_udf.return_types[0], + on_demand=True, + ) + ) + if tf.hopsworks_udf.dropped_features: + dropped_features.extend(tf.hopsworks_udf.dropped_features) for feat_name in arrow_schema.names: name = util.autofix_feature_name(feat_name) try: @@ -795,8 +837,10 @@ def parse_schema_feature_group( ) except ValueError as e: raise FeatureStoreException(f"Feature '{name}': {str(e)}") from e - features.append(feature.Feature(name, converted_type)) - return features + if name not in dropped_features: + features.append(feature.Feature(name, converted_type)) 
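The `transformed_feature_vector_col_name` property earlier in this hunk derives the serving-vector column order by removing the inputs consumed by model-dependent UDFs and appending each UDF's output column names. A dependency-free sketch of that bookkeeping (all column and output names below are made up; in hsfs they come from `hopsworks_udf.transformation_features` and `hopsworks_udf.output_column_names`):

```python
# Columns of the untransformed feature vector.
feature_vector_cols = ["id", "age", "amount"]

# Inputs consumed and outputs produced by the model-dependent UDFs (hypothetical names).
udf_input_cols = ["age", "amount"]
udf_output_cols = ["age_scaled", "amount_scaled"]

# Keep untouched columns, then append the UDF outputs, mirroring the property above.
transformed_cols = [c for c in feature_vector_cols if c not in udf_input_cols]
transformed_cols.extend(udf_output_cols)
print(transformed_cols)  # ['id', 'age_scaled', 'amount_scaled']
```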
+ + return features + transformed_features def parse_schema_training_dataset( self, dataframe: Union[pd.DataFrame, pl.DataFrame] @@ -817,6 +861,11 @@ def save_dataframe( online_write_options: Dict[str, Any], validation_id: Optional[int] = None, ) -> Optional[job.Job]: + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) + if ( isinstance(feature_group, ExternalFeatureGroup) and feature_group.online_enabled @@ -875,7 +924,22 @@ def get_training_data( query_obj: query.Query, read_options: Dict[str, Any], dataframe_type: str, + training_dataset_version: int = None, ) -> Union[pd.DataFrame, pl.DataFrame]: + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset_obj `TrainingDataset`: The training dataset metadata object. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + query_obj `Query`: The query object that contains the query used to create the feature view. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + dataframe_type `str`: The type of dataframe returned. + training_dataset_version `int`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. + """ + # dataframe_type of list and numpy are prevented here because statistics needs to be computed from the returned dataframe. # The daframe is converted into required types in the function split_labels if dataframe_type.lower() not in ["default", "polars", "pandas"]: @@ -888,16 +952,22 @@ def get_training_data( feature_view_obj, read_options, dataframe_type, + training_dataset_version, ) else: df = query_obj.read( read_options=read_options, dataframe_type=dataframe_type ) - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( training_dataset_obj, feature_view_obj, df ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset_obj, feature_view_obj, training_dataset_version + # ) return self._apply_transformation_function( - training_dataset_obj.transformation_functions, df + feature_view_obj.transformation_functions, df ) def split_labels( @@ -930,10 +1000,21 @@ def _prepare_transform_split_df( feature_view_obj: feature_view.FeatureView, read_option: Dict[str, Any], dataframe_type: str, + training_dataset_version: int = None, ) -> Dict[str, Union[pd.DataFrame, pl.DataFrame]]: """ Split a df into slices defined by `splits`. `splits` is a `dict(str, int)` which keys are name of split and values are split ratios. + + # Arguments + query_obj `Query`: The query object that contains the query used to create the feature view. + training_dataset_obj `TrainingDataset`: The training dataset metadata object. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + dataframe_type `str`: The type of dataframe returned. + training_dataset_version `int`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. 
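To make the schema rule above concrete, a minimal sketch in plain Python (not the hsfs implementation): source columns dropped by an attached UDF are omitted, and the UDF output columns are appended as on-demand features. Column names are illustrative.

```python
# Hypothetical inputs: the dataframe schema plus metadata from attached UDFs.
source_columns = ["city", "date", "temperature"]
dropped_by_udfs = {"temperature"}            # from hopsworks_udf.dropped_features
on_demand_outputs = ["temperature_scaled"]   # from hopsworks_udf.output_column_names

schema = [c for c in source_columns if c not in dropped_by_udfs] + on_demand_outputs
assert schema == ["city", "date", "temperature_scaled"]
```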
""" if ( training_dataset_obj.splits[0].split_type @@ -966,15 +1047,19 @@ def _prepare_transform_split_df( training_dataset_obj, ) - # apply transformations - # 1st parametrise transformation functions with dt split stats - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + # TODO : Currently statistics always computed since in memory training dataset retrieved is not consistent + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( training_dataset_obj, feature_view_obj, result_dfs ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset_obj, feature_view_obj, training_dataset_version + # ) # and the apply them for split_name in result_dfs: result_dfs[split_name] = self._apply_transformation_function( - training_dataset_obj.transformation_functions, + feature_view_obj.transformation_functions, result_dfs.get(split_name), ) @@ -1149,8 +1234,24 @@ def _create_hive_connection( def _return_dataframe_type( self, dataframe: Union[pd.DataFrame, pl.DataFrame], dataframe_type: str ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: - if dataframe_type.lower() in ["default", "pandas", "polars"]: + """ + Returns a dataframe of particular type. + + # Arguments + dataframe `Union[pd.DataFrame, pl.DataFrame]`: Input dataframe + dataframe_type `str`: Type of dataframe to be returned + # Returns + `Union[pd.DataFrame, pl.DataFrame, np.array, list]`: DataFrame of required type. + """ + if dataframe_type.lower() in ["default", "pandas"]: return dataframe + if dataframe_type.lower() == "polars": + if not ( + isinstance(dataframe, pl.DataFrame) or isinstance(dataframe, pl.Series) + ): + return pl.from_pandas(dataframe) + else: + return dataframe if dataframe_type.lower() == "numpy": return dataframe.values if dataframe_type.lower() == "python": @@ -1228,39 +1329,59 @@ def add_file(self, file: Optional[str]) -> Optional[str]: def _apply_transformation_function( self, - transformation_functions: Dict[ - str, transformation_function_attached.TransformationFunctionAttached - ], + transformation_functions: List[transformation_function.TransformationFunction], dataset: Union[pd.DataFrame, pl.DataFrame], ) -> Union[pd.DataFrame, pl.DataFrame]: - for ( - feature_name, - transformation_fn, - ) in transformation_functions.items(): - if isinstance(dataset, pl.DataFrame) or isinstance( - dataset, pl.dataframe.frame.DataFrame - ): - dataset = dataset.with_columns( - pl.col(feature_name).map_elements( - transformation_fn.transformation_fn - ) - ) + """ + Apply transformation function to the dataframe. + + # Arguments + transformation_functions `List[transformation_function.TransformationFunction]` : List of transformation functions. + dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. + # Returns + `DataFrame`: A pandas dataframe with the transformed data. + # Raises + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. + """ + dropped_features = set() + + if isinstance(dataset, pl.DataFrame) or isinstance( + dataset, pl.dataframe.frame.DataFrame + ): + # Converting polars dataframe to pandas because currently we support only pandas UDF's as transformation functions. 
+ if PYARROW_EXTENSION_ENABLE: + dataset = dataset.to_pandas( + use_pyarrow_extension_array=True + ) # Zero copy if pyarrow extension can be used. else: - dataset[feature_name] = dataset[feature_name].map( - transformation_fn.transformation_fn - ) - # The below functions is not required for Polars since polars does have object types like pandas - if not ( - isinstance(dataset, pl.DataFrame) - or isinstance(dataset, pl.dataframe.frame.DataFrame) - ): - offline_type = Engine.convert_spark_type_to_offline_type( - transformation_fn.output_type - ) - dataset[feature_name] = Engine._cast_column_to_offline_type( - dataset[feature_name], offline_type - ) + dataset = dataset.to_pandas(use_pyarrow_extension_array=False) + for tf in transformation_functions: + hopsworks_udf = tf.hopsworks_udf + missing_features = set(hopsworks_udf.transformation_features) - set( + dataset.columns + ) + if missing_features: + raise FeatureStoreException( + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." + ) + if tf.hopsworks_udf.dropped_features: + dropped_features.update(tf.hopsworks_udf.dropped_features) + dataset = pd.concat( + [ + dataset, + tf.hopsworks_udf.get_udf()( + *( + [ + dataset[feature] + for feature in tf.hopsworks_udf.transformation_features + ] + ) + ), + ], + axis=1, + ) + dataset = dataset.drop(dropped_features, axis=1) return dataset @staticmethod @@ -1439,8 +1560,11 @@ def acked(err: Exception, msg: Any) -> None: elif not isinstance( feature_group, ExternalFeatureGroup ) and self._start_offline_materialization(offline_write_options): - if (not offline_write_options.get("skip_offsets", False) - and self._job_api.last_execution(feature_group.materialization_job)): # always skip offsets if executing job for the first time + if not offline_write_options.get( + "skip_offsets", False + ) and self._job_api.last_execution( + feature_group.materialization_job + ): # always skip offsets if executing job for the first time # don't provide the current offsets (read from where the job last left off) initial_check_point = "" # provide the initial_check_point as it will reduce the read amplification of materialization job diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index b9f8621cfc..322e9e993a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,12 +23,13 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import Any, List, Optional, TypeVar, Union +from typing import Any, Dict, List, Optional, TypeVar, Union import avro import numpy as np import pandas as pd import tzlocal +from hsfs.constructor import query # in case importing in %%local from hsfs.core.vector_db_client import VectorDbClient @@ -82,11 +83,18 @@ def iteritems(self): DataContextConfig, InMemoryStoreBackendDefaults, ) -from hsfs import client, feature, training_dataset_feature, util +from hsfs import ( + client, + feature, + feature_view, + training_dataset, + training_dataset_feature, + transformation_function, + util, +) from hsfs import feature_group as fg_mod from hsfs.client import hopsworks from hsfs.client.exceptions import FeatureStoreException -from hsfs.constructor import query from hsfs.core import ( dataset_api, delta_engine, @@ -152,7 +160,14 @@ def show(self, sql_query, feature_store, n, online_conn, read_options=None): sql_query, feature_store, online_conn, "default", read_options ).show(n) - def 
read_vector_db(self, feature_group: fg_mod.FeatureGroup, n: int =None, dataframe_type: str="default") -> Union[pd.DataFrame, np.ndarray, List[List[Any]], TypeVar("pyspark.sql.DataFrame")]: + def read_vector_db( + self, + feature_group: fg_mod.FeatureGroup, + n: int = None, + dataframe_type: str = "default", + ) -> Union[ + pd.DataFrame, np.ndarray, List[List[Any]], TypeVar("pyspark.sql.DataFrame") + ]: results = VectorDbClient.read_feature_group(feature_group, n) feature_names = [f.name for f in feature_group.features] dataframe_type = dataframe_type.lower() @@ -336,6 +351,10 @@ def save_dataframe( validation_id=None, ): try: + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) if ( isinstance(feature_group, fg_mod.ExternalFeatureGroup) and feature_group.online_enabled @@ -380,6 +399,11 @@ def save_stream_dataframe( checkpoint_dir, write_options, ): + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) + write_options = self._get_kafka_config( feature_group.feature_store_id, write_options ) @@ -542,12 +566,26 @@ def _online_fg_to_avro(self, feature_group, dataframe): def get_training_data( self, - training_dataset, - feature_view_obj, - query_obj, - read_options, - dataframe_type, + training_dataset: training_dataset.TrainingDataset, + feature_view_obj: feature_view.FeatureView, + query_obj: query.Query, + read_options: Dict[str, Any], + dataframe_type: str, + training_dataset_version: int = None, ): + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset_obj `TrainingDataset`: The training dataset metadata object. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + query_obj `Query`: The query object that contains the query used to create the feature view. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + dataframe_type `str`: The type of dataframe returned. + training_dataset_version `int`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. + """ return self.write_training_dataset( training_dataset, query_obj, @@ -556,6 +594,7 @@ def get_training_data( read_options=read_options, to_df=True, feature_view_obj=feature_view_obj, + training_dataset_version=training_dataset_version, ) def split_labels(self, df, labels, dataframe_type): @@ -578,14 +617,30 @@ def drop_columns(self, df, drop_cols): def write_training_dataset( self, - training_dataset, - query_obj, - user_write_options, - save_mode, - read_options=None, - feature_view_obj=None, - to_df=False, + training_dataset: training_dataset.TrainingDataset, + query_obj: query.Query, + user_write_options: Dict[str, Any], + save_mode: str, + read_options: Dict[str, Any] = None, + feature_view_obj: feature_view.FeatureView = None, + to_df: bool = False, + training_dataset_version: Optional[int] = None, ): + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset `TrainingDataset`: The training dataset metadata object. + query_obj `Query`: The query object that contains the query used to create the feature view. + user_write_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for writing data using spark. 
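Putting the write-path changes above together, a hedged usage sketch: an on-demand UDF attached to a feature group is applied by the engine during inserts and streaming writes, so its dropped source column never reaches the store. The feature store handle, names, and the conversion rate are placeholders, and the name of the on-demand output column is whatever `output_column_names` resolves to.

```python
import pandas as pd
from hopsworks import udf  # import path as used in the examples in this PR

@udf(float, drop=["amount"])
def amount_in_eur(amount: pd.Series) -> pd.Series:
    return amount * 0.92  # hypothetical conversion rate

fs = ...  # feature store handle obtained elsewhere

fg = fs.create_feature_group(
    name="transactions",
    version=1,
    primary_key=["tx_id"],
    transformation_functions=[amount_in_eur("amount")],
)
# save_dataframe()/save_stream_dataframe() apply the UDF before writing,
# so "amount" is dropped and the on-demand output is materialized instead.
fg.insert(pd.DataFrame({"tx_id": [1, 2], "amount": [10.0, 20.0]}))
```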
+ save_mode `str`: Spark save mode to be used while writing data. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + to_df `bool`: Return dataframe instead of writing the data. + training_dataset_version `Optional[int]`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. + """ write_options = self.write_options( training_dataset.data_format, user_write_options ) @@ -600,14 +655,20 @@ def write_training_dataset( else: raise ValueError("Dataset should be a query.") - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( training_dataset, feature_view_obj, dataset ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset, feature_view_obj, training_dataset_version + # ) + if training_dataset.coalesce: dataset = dataset.coalesce(1) path = training_dataset.location + "/" + training_dataset.name return self._write_training_dataset_single( - training_dataset.transformation_functions, + feature_view_obj.transformation_functions, dataset, training_dataset.storage_connector, training_dataset.data_format, @@ -626,11 +687,22 @@ def write_training_dataset( split_dataset[key] = split_dataset[key].cache() - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( - training_dataset, feature_view_obj, split_dataset - ) + if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( + training_dataset, feature_view_obj, split_dataset + ) + else: + transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + training_dataset, feature_view_obj, training_dataset_version + ) + return self._write_training_dataset_splits( - training_dataset, split_dataset, write_options, save_mode, to_df=to_df + training_dataset, + split_dataset, + write_options, + save_mode, + to_df=to_df, + transformation_functions=feature_view_obj.transformation_functions, ) def _split_df(self, query_obj, training_dataset, read_options=None): @@ -782,11 +854,14 @@ def _write_training_dataset_splits( write_options, save_mode, to_df=False, + transformation_functions: List[ + transformation_function.TransformationFunction + ] = None, ): for split_name, feature_dataframe in feature_dataframes.items(): split_path = training_dataset.location + "/" + str(split_name) feature_dataframes[split_name] = self._write_training_dataset_single( - training_dataset.transformation_functions, + transformation_functions, feature_dataframe, training_dataset.storage_connector, training_dataset.data_format, @@ -1049,8 +1124,30 @@ def read_options(self, data_format, provided_options): options.update(provided_options) return options - def parse_schema_feature_group(self, dataframe, time_travel_format=None): + def parse_schema_feature_group( + self, + dataframe, + time_travel_format=None, + transformation_functions: Optional[ + List[transformation_function.TransformationFunction] + ] = None, + ): features = [] + transformed_features = [] + dropped_features = [] + + if transformation_functions: + for tf in 
transformation_functions: + transformed_features.append( + feature.Feature( + tf.hopsworks_udf.output_column_names[0], + tf.hopsworks_udf.return_types[0], + on_demand=True, + ) + ) + if tf.hopsworks_udf.dropped_features: + dropped_features.extend(tf.hopsworks_udf.dropped_features) + using_hudi = time_travel_format == "HUDI" for feat in dataframe.schema: name = util.autofix_feature_name(feat.name) @@ -1060,12 +1157,13 @@ def parse_schema_feature_group(self, dataframe, time_travel_format=None): ) except ValueError as e: raise FeatureStoreException(f"Feature '{feat.name}': {str(e)}") from e - features.append( - feature.Feature( - name, converted_type, feat.metadata.get("description", None) + if name not in dropped_features: + features.append( + feature.Feature( + name, converted_type, feat.metadata.get("description", None) + ) ) - ) - return features + return features + transformed_features def parse_schema_training_dataset(self, dataframe): return [ @@ -1162,67 +1260,68 @@ def add_cols_to_delta_table(self, feature_group, new_features): "spark.databricks.delta.schema.autoMerge.enabled", "true" ).save(feature_group.location) - def _apply_transformation_function(self, transformation_functions, dataset): - # generate transformation function expressions - transformed_feature_names = [] - transformation_fn_expressions = [] - for ( - feature_name, - transformation_fn, - ) in transformation_functions.items(): - fn_registration_name = ( - transformation_fn.name - + "_" - + str(transformation_fn.version) - + "_" - + feature_name + def _apply_transformation_function( + self, + transformation_functions: List[transformation_function.TransformationFunction], + dataset: DataFrame, + ): + """ + Apply transformation function to the dataframe. + + # Arguments + transformation_functions `List[TransformationFunction]` : List of transformation functions. + dataset `Union[DataFrame]`: A spark dataframe. + # Returns + `DataFrame`: A spark dataframe with the transformed data. + # Raises + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. + """ + dropped_features = set() + transformations = [] + transformation_features = [] + output_col_names = [] + explode_name = [] + for tf in transformation_functions: + hopsworks_udf = tf.hopsworks_udf + missing_features = set(hopsworks_udf.transformation_features) - set( + dataset.columns ) - def timezone_decorator(func, trans_fn=transformation_fn): - if trans_fn.output_type != "TIMESTAMP": - return func - - current_timezone = tzlocal.get_localzone() - - def decorated_func(x): - result = func(x) - if isinstance(result, datetime): - if result.tzinfo is None: - # if timestamp is timezone unaware, make sure it's localized to the system's timezone. - # otherwise, spark will implicitly convert it to the system's timezone. 
- return result.replace(tzinfo=current_timezone) - else: - # convert to utc, then localize to system's timezone - return result.astimezone(timezone.utc).replace( - tzinfo=current_timezone - ) - return result - - return decorated_func - - self._spark_session.udf.register( - fn_registration_name, - timezone_decorator(transformation_fn.transformation_fn), - transformation_fn.output_type, - ) - transformation_fn_expressions.append( - "{fn_name:}({name:}) AS {name:}".format( - fn_name=fn_registration_name, name=feature_name + if missing_features: + raise FeatureStoreException( + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - ) - transformed_feature_names.append(feature_name) + if tf.hopsworks_udf.dropped_features: + dropped_features.update(tf.hopsworks_udf.dropped_features) - # generate non transformation expressions - no_transformation_expr = [ - "{name:} AS {name:}".format(name=col_name) - for col_name in dataset.columns - if col_name not in transformed_feature_names - ] + pandas_udf = hopsworks_udf.get_udf() + output_col_name = hopsworks_udf.output_column_names[0] + + transformations.append(pandas_udf) + output_col_names.append(output_col_name) + transformation_features.append(hopsworks_udf.transformation_features) + + if len(hopsworks_udf.return_types) > 1: + explode_name.append(f"{output_col_name}.*") + else: + explode_name.append(output_col_name) + + untransformed_columns = [] # Untransformed column maintained as a list since order is imported while selecting features. + for column in dataset.columns: + if column not in dropped_features: + untransformed_columns.append(column) + # Applying transformations + transformed_dataset = dataset.select( + *untransformed_columns, + *[ + fun(*feature).alias(output_col_name) + for fun, feature, output_col_name in zip( + transformations, transformation_features, output_col_names + ) + ], + ).select(*untransformed_columns, *explode_name) - # generate entire expression and execute it - transformation_fn_expressions.extend(no_transformation_expr) - transformed_dataset = dataset.selectExpr(*transformation_fn_expressions) - return transformed_dataset.select(*dataset.columns) + return transformed_dataset def _setup_gcp_hadoop_conf(self, storage_connector, path): PROPERTY_ENCRYPTION_KEY = "fs.gs.encryption.key" diff --git a/python/hsfs/feature.py b/python/hsfs/feature.py index 89f19b060d..412929a75e 100644 --- a/python/hsfs/feature.py +++ b/python/hsfs/feature.py @@ -53,6 +53,7 @@ def __init__( "hsfs.feature_group.SpineGroup", ] ] = None, + on_demand: bool = False, **kwargs, ) -> None: self._name = util.autofix_feature_name(name) @@ -67,6 +68,7 @@ def __init__( self._feature_group_id = feature_group.id else: self._feature_group_id = feature_group_id + self._on_demand = on_demand def to_dict(self) -> Dict[str, Any]: """Get structured info about specific Feature in python dictionary format. 
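For the Spark `_apply_transformation_function` rewrite above, a minimal standalone sketch of the select-based application, assuming a local Spark session with pyarrow available. `add_one` stands in for the pandas UDF generated from a HopsworksUdf; a UDF with multiple return types would additionally be expanded with a `<name>.*` selection as in the code above.

```python
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.master("local[1]").getOrCreate()

@pandas_udf("double")
def add_one(value: pd.Series) -> pd.Series:
    return value + 1

df = spark.createDataFrame([(1, 10.0), (2, 20.0)], ["id", "value"])

# "value" is assumed to be dropped by the UDF, so it is excluded from the
# untransformed columns; the UDF output is selected under an alias.
untransformed = [c for c in df.columns if c != "value"]
df.select(*untransformed, add_one("value").alias("value_plus_one")).show()
```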
@@ -93,6 +95,7 @@ def to_dict(self) -> Dict[str, Any]: "onlineType": self._online_type, "defaultValue": self._default_value, "featureGroupId": self._feature_group_id, + "onDemand": self.on_demand, } def json(self) -> str: @@ -206,6 +209,15 @@ def default_value(self, default_value: Optional[str]) -> None: def feature_group_id(self) -> Optional[int]: return self._feature_group_id + @property + def on_demand(self) -> bool: + """Whether the feature is a on-demand feature computed using on-demand transformation functions""" + return self._on_demand + + @on_demand.setter + def on_demand(self, on_demand) -> None: + self._on_demand = on_demand + def __lt__(self, other: Any) -> "filter.Filter": return filter.Filter(self, filter.Filter.LT, other) diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index de5577417c..8240f115e9 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -73,8 +73,10 @@ from hsfs.embedding import EmbeddingIndex from hsfs.expectation_suite import ExpectationSuite from hsfs.ge_validation_result import ValidationResult +from hsfs.hopsworks_udf import HopsworksUdf, UDFType from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig +from hsfs.transformation_function import TransformationFunction from hsfs.validation_report import ValidationReport @@ -543,8 +545,13 @@ def get_storage_connector(self): """ storage_connector_provenance = self.get_storage_connector_provenance() - if storage_connector_provenance.inaccessible or storage_connector_provenance.deleted: - _logger.info("The parent storage connector is deleted or inaccessible. For more details access `get_storage_connector_provenance`") + if ( + storage_connector_provenance.inaccessible + or storage_connector_provenance.deleted + ): + _logger.info( + "The parent storage connector is deleted or inaccessible. For more details access `get_storage_connector_provenance`" + ) if storage_connector_provenance.accessible: return storage_connector_provenance.accessible[0] @@ -2022,6 +2029,9 @@ def __init__( Union[Dict[str, Any], "deltastreamer_jobconf.DeltaStreamerJobConf"] ] = None, deprecated: bool = False, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, **kwargs, ) -> None: super().__init__( @@ -2124,6 +2134,48 @@ def __init__( self._feature_writers: Optional[Dict[str, callable]] = None self._writer: Optional[callable] = None + # On-Demand Transformation Functions + self._transformation_functions: List[TransformationFunction] = [] + + if transformation_functions: + for transformation_function in transformation_functions: + if not isinstance(transformation_function, TransformationFunction): + self._transformation_functions.append( + TransformationFunction( + featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + ) + else: + if not transformation_function.hopsworks_udf.udf_type: + transformation_function.hopsworks_udf.udf_type = ( + UDFType.ON_DEMAND + ) + self._transformation_functions.append(transformation_function) + + if self._transformation_functions: + self._transformation_functions = ( + FeatureGroup._sort_transformation_functions( + self._transformation_functions + ) + ) + + @staticmethod + def _sort_transformation_functions( + transformation_functions: List[TransformationFunction], + ) -> List[TransformationFunction]: + """ + Function that sorts transformation functions in the order of the output column names. 
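Relating to the new `on_demand` flag on `Feature` introduced above, a small hedged sketch (assuming the usual `(name, type)` positional constructor) showing how the flag is carried into the serialized payload:

```python
from hsfs import feature

f = feature.Feature("amount_in_eur", "double", on_demand=True)
print(f.to_dict()["onDemand"])  # True
```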
+ The list of transformation functions are sorted based on the output columns names to maintain consistent ordering. + # Arguments + transformation_functions: `List[TransformationFunction]`. List of transformation functions to be sorted + # Returns + `List[TransformationFunction]`: List of transformation functions to be sorted + """ + return sorted(transformation_functions, key=lambda x: x.output_column_names[0]) + def read( self, wallclock_time: Optional[Union[str, int, datetime, date]] = None, @@ -3204,6 +3256,17 @@ def from_response_json( json_decamelized["embedding_index"] = EmbeddingIndex.from_response_json( json_decamelized["embedding_index"] ) + if "transformation_functions" in json_decamelized: + transformation_functions = json_decamelized["transformation_functions"] + json_decamelized["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] return cls(**json_decamelized) for fg in json_decamelized: if "type" in fg: @@ -3214,6 +3277,17 @@ def from_response_json( fg["embedding_index"] = EmbeddingIndex.from_response_json( fg["embedding_index"] ) + if "transformation_functions" in fg: + transformation_functions = fg["transformation_functions"] + fg["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] return [cls(**fg) for fg in json_decamelized] def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureGroup": @@ -3224,6 +3298,17 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureGroup" json_decamelized["embedding_index"] = EmbeddingIndex.from_response_json( json_decamelized["embedding_index"] ) + if "transformation_functions" in json_decamelized: + transformation_functions = json_decamelized["transformation_functions"] + json_decamelized["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] self.__init__(**json_decamelized) return self @@ -3270,6 +3355,7 @@ def to_dict(self) -> Dict[str, Any]: "topicName": self.topic_name, "notificationTopicName": self.notification_topic_name, "deprecated": self.deprecated, + "transformationFunctions": self._transformation_functions, } if self.embedding_index: fg_meta_dict["embeddingIndex"] = self.embedding_index.to_dict() @@ -3376,6 +3462,13 @@ def statistics(self) -> "Statistics": ) return super().statistics + @property + def transformation_functions( + self, + ) -> List[TransformationFunction]: + """Get transformation functions.""" + return self._transformation_functions + @description.setter def description(self, new_description: Optional[str]) -> None: self._description = new_description @@ -3402,6 +3495,13 @@ def stream(self, stream: bool) -> None: def parents(self, new_parents: "explicit_provenance.Links") -> None: self._parents = new_parents + @transformation_functions.setter + def transformation_functions( + self, + transformation_functions: List[TransformationFunction], + ) -> None: + self._transformation_functions = transformation_functions + @typechecked class ExternalFeatureGroup(FeatureGroupBase): diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index c8a18dc6c0..2ec47f312e 
100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -22,7 +22,6 @@ import great_expectations as ge import humps -import numpy import numpy as np import pandas as pd import polars as pl @@ -49,6 +48,7 @@ ) from hsfs.decorators import typechecked from hsfs.embedding import EmbeddingIndex +from hsfs.hopsworks_udf import HopsworksUdf from hsfs.statistics_config import StatisticsConfig from hsfs.transformation_function import TransformationFunction @@ -510,6 +510,9 @@ def create_feature_group( parents: Optional[List[feature_group.FeatureGroup]] = None, topic_name: Optional[str] = None, notification_topic_name: Optional[str] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> "feature_group.FeatureGroup": """Create a feature group metadata object. @@ -518,13 +521,26 @@ def create_feature_group( # connect to the Feature Store fs = ... + # define the on-demand transformation functions + @udf(int) + def plus_one(value): + return value + 1 + + @udf(int) + def plus_two(value): + return value + 2 + + # construct list of "transformation functions" on features + transformation_functions = [plus_one("feature1"), plus_two("feature2"))] + fg = fs.create_feature_group( name='air_quality', description='Air Quality characteristics of each day', version=1, primary_key=['city','date'], online_enabled=True, - event_time='date' + event_time='date', + transformation_functions=transformation_functions ) ``` @@ -592,6 +608,9 @@ def create_feature_group( defaults to using project topic. notification_topic_name: Optionally, define the name of the topic used for sending notifications when entries are inserted or updated on the online feature store. If left undefined no notifications are sent. + transformation_functions: On-Demand Transformation functions attached to the feature group. + It can be a list of list of user defined functions defined using the hopsworks `@udf` decorator. + Defaults to `None`, no transformations. # Returns `FeatureGroup`. The feature group metadata object. @@ -616,6 +635,7 @@ def create_feature_group( parents=parents or [], topic_name=topic_name, notification_topic_name=notification_topic_name, + transformation_functions=transformation_functions, ) feature_group_object.feature_store = self return feature_group_object @@ -642,6 +662,9 @@ def get_or_create_feature_group( parents: Optional[List[feature_group.FeatureGroup]] = None, topic_name: Optional[str] = None, notification_topic_name: Optional[str] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> Union[ "feature_group.FeatureGroup", "feature_group.ExternalFeatureGroup", @@ -661,6 +684,7 @@ def get_or_create_feature_group( primary_key=["day", "area"], online_enabled=True, event_time="timestamp", + transformation_functions=transformation_functions, ) ``` @@ -726,6 +750,9 @@ def get_or_create_feature_group( defaults to using project topic. notification_topic_name: Optionally, define the name of the topic used for sending notifications when entries are inserted or updated on the online feature store. If left undefined no notifications are sent. + transformation_functions: On-Demand Transformation functions attached to the feature group. + It can be a list of list of user defined functions defined using the hopsworks `@udf` decorator. + Defaults to `None`, no transformations. # Returns `FeatureGroup`. The feature group metadata object. 
@@ -759,6 +786,7 @@ def get_or_create_feature_group( parents=parents or [], topic_name=topic_name, notification_topic_name=notification_topic_name, + transformation_functions=transformation_functions, ) feature_group_object.feature_store = self return feature_group_object @@ -1278,36 +1306,21 @@ def create_training_dataset( @usage.method_logger def create_transformation_function( self, - transformation_function: callable, - output_type: Union[ - str, - bytes, - int, - numpy.int8, - numpy.int16, - numpy.int32, - numpy.int64, - float, - numpy.float64, - datetime.datetime, - numpy.datetime64, - datetime.date, - bool, - ], + transformation_function: HopsworksUdf, version: Optional[int] = None, ) -> "TransformationFunction": """Create a transformation function metadata object. !!! example ```python - # define function + # define the transformation function as a Hopsworks's UDF + @udf(int) def plus_one(value): return value + 1 # create transformation function plus_one_meta = fs.create_transformation_function( transformation_function=plus_one, - output_type=int, version=1 ) @@ -1321,16 +1334,14 @@ def plus_one(value): call the `save()` method of the transformation function metadata object. # Arguments - transformation_function: callable object. - output_type: python or numpy output type that will be inferred as pyspark.sql.types type. + transformation_function: Hopsworks UDF. # Returns: `TransformationFunction`: The TransformationFunction metadata object. """ return TransformationFunction( featurestore_id=self._id, - transformation_fn=transformation_function, - output_type=output_type, + hopsworks_udf=transformation_function, version=version, ) @@ -1388,9 +1399,7 @@ def get_transformation_function( name='feature_view_name', query=query, labels=["target_column"], - transformation_functions={ - "column_to_transform": min_max_scaler - } + transformation_functions=[min_max_scaler("feature1")] ) ``` @@ -1417,12 +1426,12 @@ def get_transformation_function( name='transactions_view', query=query, labels=["fraud_label"], - transformation_functions = { - "category_column": label_encoder, - "weight": robust_scaler, - "age": min_max_scaler, - "salary": standard_scaler - } + transformation_functions = [ + label_encoder("category_column"), + robust_scaler("weight"), + min_max_scaler("age"), + standard_scaler("salary") + ] ) ``` @@ -1464,7 +1473,9 @@ def create_feature_view( labels: Optional[List[str]] = None, inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, - transformation_functions: Optional[Dict[str, TransformationFunction]] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> feature_view.FeatureView: """Create a feature view metadata object and saved it to hopsworks. 
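A hedged sketch of the updated registration flow above: the output type no longer needs to be passed separately because it travels with the `@udf` decorator, and the metadata object is persisted with `save()` as the docstring notes. The function body and names are illustrative.

```python
import pandas as pd
from hopsworks import udf

@udf(float)
def double(value: pd.Series) -> pd.Series:
    return value * 2

fs = ...  # feature store handle obtained elsewhere

# Register the UDF as a reusable transformation function.
double_meta = fs.create_transformation_function(transformation_function=double, version=1)
double_meta.save()

# Retrieve it later, e.g. to include it in a feature view's transformation_functions.
double_fn = fs.get_transformation_function(name="double", version=1)
```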
@@ -1480,11 +1491,13 @@ def create_feature_view( # construct the query query = fg1.select_all().join(fg2.select_all()) - # get the transformation functions - standard_scaler = fs.get_transformation_function(name='standard_scaler') + # define the transformation function as a Hopsworks's UDF + @udf(int) + def plus_one(value): + return value + 1 - # construct dictionary of "feature - transformation function" pairs - transformation_functions = {col_name: standard_scaler for col_name in df.columns} + # construct list of "transformation functions" on features + transformation_functions = [plus_one("feature1"), plus_one("feature1"))] feature_view = fs.create_feature_view( name='air_quality_fv', @@ -1502,7 +1515,7 @@ def create_feature_view( # define query object query = ... - # define dictionary with column names and transformation functions pairs + # define list of transformation functions mapping_transformers = ... # create feature view @@ -1548,10 +1561,9 @@ def create_feature_view( Training helper columns can be optionally fetched with training data. For more details see documentation for feature view's get training data methods. Defaults to `[], no training helper columns. - transformation_functions: A dictionary mapping tansformation functions to - to the features they should be applied to before writing out the - vector and at inference time. Defaults to `{}`, no - transformations. + transformation_functions: Model Dependent Transformation functions attached to the feature view. + It can be a list of list of user defined functions defined using the hopsworks `@udf` decorator. + Defaults to `None`, no transformations. # Returns: `FeatureView`: The feature view metadata object. @@ -1626,10 +1638,9 @@ def get_or_create_feature_view( Training helper columns can be optionally fetched with training data. For more details see documentation for feature view's get training data methods. Defaults to `[], no training helper columns. - transformation_functions: A dictionary mapping tansformation functions to - to the features they should be applied to before writing out the - vector and at inference time. Defaults to `{}`, no - transformations. + transformation_functions: Model Dependent Transformation functions attached to the feature view. + It can be a list of list of user defined functions defined using the hopsworks `@udf` decorator. + Defaults to `None`, no transformations. # Returns: `FeatureView`: The feature view metadata object. 
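Consolidating the feature-view examples above into one hedged sketch: model-dependent transformations are now passed as a list of UDF applications rather than a column-to-function dictionary. The query, label, and feature names are placeholders.

```python
import pandas as pd
from hopsworks import udf

@udf(float, drop=["age"])
def scale_age(age: pd.Series) -> pd.Series:
    return age / 100.0

fs = ...     # feature store handle obtained elsewhere
query = ...  # query object built from feature group selects

feature_view = fs.create_feature_view(
    name="transactions_view",
    query=query,
    labels=["fraud_label"],
    transformation_functions=[scale_age("age")],  # model-dependent transformations
)
```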
@@ -1649,7 +1660,7 @@ def get_or_create_feature_view( labels=labels or [], inference_helper_columns=inference_helper_columns or [], training_helper_columns=training_helper_columns or [], - transformation_functions=transformation_functions or {}, + transformation_functions=transformation_functions or [], ) else: raise e diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 4f6a9dbb8e..fc9151ae94 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -15,7 +15,6 @@ # from __future__ import annotations -import copy import json import logging import warnings @@ -36,7 +35,6 @@ util, ) from hsfs import serving_key as skm -from hsfs import transformation_function as tfm from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import filter, query from hsfs.constructor.filter import Filter, Logic @@ -56,9 +54,11 @@ from hsfs.core.vector_db_client import VectorDbClient from hsfs.decorators import typechecked from hsfs.feature import Feature +from hsfs.hopsworks_udf import HopsworksUdf, UDFType from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig from hsfs.training_dataset_split import TrainingDatasetSplit +from hsfs.transformation_function import TransformationFunction _logger = logging.getLogger(__name__) @@ -98,7 +98,7 @@ def __init__( inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, transformation_functions: Optional[ - Dict[str, tfm.TransformationFunction] + List[Union[TransformationFunction, HopsworksUdf]] ] = None, featurestore_name: Optional[str] = None, serving_keys: Optional[List[skm.ServingKey]] = None, @@ -119,14 +119,32 @@ def __init__( self._training_helper_columns = ( training_helper_columns if training_helper_columns else [] ) - self._transformation_functions = ( - { - ft_name: copy.deepcopy(transformation_functions[ft_name]) - for ft_name in transformation_functions - } - if transformation_functions - else {} - ) + + self._transformation_functions: List[TransformationFunction] = [] + + if transformation_functions: + for transformation_function in transformation_functions: + if not isinstance(transformation_function, TransformationFunction): + self._transformation_functions.append( + TransformationFunction( + self.featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ) + else: + if not transformation_function.hopsworks_udf.udf_type: + transformation_function.hopsworks_udf.udf_type = ( + UDFType.MODEL_DEPENDENT + ) + self._transformation_functions.append(transformation_function) + + if self._transformation_functions: + self._transformation_functions = FeatureView._sort_transformation_functions( + self._transformation_functions + ) + self._features = [] self._feature_view_engine: feature_view_engine.FeatureViewEngine = ( feature_view_engine.FeatureViewEngine(featurestore_id) @@ -370,6 +388,23 @@ def init_serving( self.query, serving_keys=self._serving_keys ) + @staticmethod + def _sort_transformation_functions( + transformation_functions: List[TransformationFunction], + ) -> List[TransformationFunction]: + """ + Function that sorts transformation functions in the order of the output column names. + + The list of transformation functions are sorted based on the output columns names to maintain consistent ordering. + + # Arguments + transformation_functions: `List[TransformationFunction]`. 
List of transformation functions to be sorted + + # Returns + `List[TransformationFunction]`: List of transformation functions to be sorted + """ + return sorted(transformation_functions, key=lambda x: x.output_column_names[0]) + def init_batch_scoring( self, training_dataset_version: Optional[int] = None, @@ -463,6 +498,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[Dict[str, Any]] = None, ) -> Union[List[Any], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vector from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -536,6 +572,7 @@ def get_feature_vector( force_sql_client: boolean, defaults to False. If set to True, reads from online feature store using the SQL client if initialised. allow_missing: Setting to `True` returns feature vectors with missing values. + request_parameters: Request parameters required by on-demand transformation functions to compute on-demand features present in the feature view. # Returns `list`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list"`, `"pandas"`, `"polars"` or `"numpy"` @@ -561,6 +598,7 @@ def get_feature_vector( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + request_parameters=request_parameters, ) def get_feature_vectors( @@ -572,6 +610,7 @@ def get_feature_vectors( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[List[Dict[str, Any]]] = None, ) -> Union[List[List[Any]], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vectors in batches from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -643,6 +682,7 @@ def get_feature_vectors( force_rest_client: boolean, defaults to False. If set to True, reads from online feature store using the REST client if initialised. allow_missing: Setting to `True` returns feature vectors with missing values. + request_parameters: Request parameters required by on-demand transformation functions to compute on-demand features present in the feature view. # Returns `List[list]`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list", `"pandas"`,`"polars"` or `"numpy"` @@ -670,6 +710,7 @@ def get_feature_vectors( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + request_parameters=request_parameters, ) def get_inference_helper( @@ -823,6 +864,7 @@ def find_neighbors( the number of results returned may be less than k. Try using a large value of k and extract the top k items from the results if needed. + # Arguments embedding: The target embedding for which neighbors are to be found. feature: The feature used to compute similarity score. 
Required only if there @@ -994,7 +1036,7 @@ def get_batch_data( start_time, end_time, self._batch_scoring_server.training_dataset_version, - self._batch_scoring_server._transformation_functions, + self._batch_scoring_server._model_dependent_transformation_functions, read_options, spine, primary_keys, @@ -3386,6 +3428,14 @@ def create_feature_monitoring( @classmethod def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": + """ + Function that constructs the class object from its json serialization. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ json_decamelized = humps.decamelize(json_dict) serving_keys = json_decamelized.get("serving_keys", None) @@ -3393,6 +3443,7 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": serving_keys = [ skm.ServingKey.from_response_json(sk) for sk in serving_keys ] + transformation_functions = json_decamelized.get("transformation_functions", {}) fv = cls( id=json_decamelized.get("id", None), name=json_decamelized["name"], @@ -3402,6 +3453,17 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": description=json_decamelized.get("description", None), featurestore_name=json_decamelized.get("featurestore_name", None), serving_keys=serving_keys, + transformation_functions=[ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.MODEL_DEPENDENT, + } + ) + for transformation_function in transformation_functions + ] + if transformation_functions + else [], ) features = json_decamelized.get("features", []) if features: @@ -3423,6 +3485,14 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": return fv def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView": + """ + Function that updates the class object from its json serialization. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ other = self.from_response_json(json_dict) for key in [ "name", @@ -3434,6 +3504,7 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView": "labels", "inference_helper_columns", "training_helper_columns", + "transformation_functions", "schema", "serving_keys", ]: @@ -3463,9 +3534,21 @@ def _init_feature_monitoring_engine(self) -> None: ) def json(self) -> str: + """ + Convert class into its json serialized form. + + # Returns + `str`: Json serialized object. + """ return json.dumps(self, cls=util.FeatureStoreEncoder) def to_dict(self) -> Dict[str, Any]: + """ + Convert class into a dictionary. + + # Returns + `Dict`: Dictionary that contains all data required to json serialize the object. 
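A hedged usage sketch for the `request_parameters` argument added to `get_feature_vector` and `get_feature_vectors` above; the feature view, serving-key value, and parameter name are illustrative, and the surrounding calls (`get_feature_view`, `init_serving`, `entry`) are standard hsfs usage not shown in this diff.

```python
feature_view = fs.get_feature_view(name="transactions_view", version=1)
feature_view.init_serving()

# Request-time inputs consumed by on-demand transformation functions in the view.
vector = feature_view.get_feature_vector(
    entry={"tx_id": 42},
    request_parameters={"transaction_time": "2024-05-01 12:00:00"},
)

# Batch variant: one mapping per requested vector.
vectors = feature_view.get_feature_vectors(
    entry=[{"tx_id": 42}, {"tx_id": 43}],
    request_parameters=[
        {"transaction_time": "2024-05-01 12:00:00"},
        {"transaction_time": "2024-05-01 12:05:00"},
    ],
)
```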
+ """ return { "featurestoreId": self._featurestore_id, "name": self._name, @@ -3473,6 +3556,7 @@ def to_dict(self) -> Dict[str, Any]: "description": self._description, "query": self._query, "features": self._features, + "transformationFunctions": self._transformation_functions, "type": "featureViewDTO", } @@ -3578,14 +3662,14 @@ def query(self, query_obj: "query.Query") -> None: @property def transformation_functions( self, - ) -> Dict[str, tfm.TransformationFunction]: + ) -> List[TransformationFunction]: """Get transformation functions.""" return self._transformation_functions @transformation_functions.setter def transformation_functions( self, - transformation_functions: Dict[str, tfm.TransformationFunction], + transformation_functions: List[TransformationFunction], ) -> None: self._transformation_functions = transformation_functions @@ -3642,3 +3726,18 @@ def serving_keys(self) -> List[skm.ServingKey]: @serving_keys.setter def serving_keys(self, serving_keys: List[skm.ServingKey]) -> None: self._serving_keys = serving_keys + + @property + def transformed_features(self) -> List[str]: + """Name of features of a feature view after transformation functions have been applied""" + transformation_features = set() + transformed_column_names = [] + for tf in self.transformation_functions: + transformed_column_names.extend(tf.output_column_names) + transformation_features.update(tf.hopsworks_udf.transformation_features) + + return [ + feature.name + for feature in self.features + if feature.name not in transformation_features + ] + transformed_column_names diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py new file mode 100644 index 0000000000..697eb06f38 --- /dev/null +++ b/python/hsfs/hopsworks_udf.py @@ -0,0 +1,916 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ast +import copy +import inspect +import json +import warnings +from dataclasses import dataclass +from datetime import date, datetime, time +from enum import Enum +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import humps +from hsfs import engine, util +from hsfs.client.exceptions import FeatureStoreException +from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.decorators import typechecked +from hsfs.transformation_statistics import TransformationStatistics + + +class UDFType(Enum): + """ + Class that store the possible types of transformation functions. + """ + + MODEL_DEPENDENT = "model_dependent" + ON_DEMAND = "on_demand" + + +def udf( + return_type: Union[List[type], type], drop: Optional[Union[str, List[str]]] = None +) -> "HopsworksUdf": + """ + Create an User Defined Function that can be and used within the Hopsworks Feature Store. + + Hopsworks UDF's are user defined functions that executes as 'pandas_udf' when executing + in spark engine and as pandas functions in the python engine. 
The pandas udf/pandas functions + gets as inputs pandas Series's and can provide as output a pandas Series or a pandas DataFrame. + A Hopsworks udf is defined using the `hopsworks_udf` decorator. The outputs of the defined UDF + must be mentioned in the decorator as a list of python types. + + + !!! example + ```python + from hopsworks import udf + + @udf(float) + def add_one(data1 : pd.Series): + return data1 + 1 + ``` + + # Arguments + return_type: `list`. The output types of the defined UDF + drop: `List[str]`. The features to be dropped after application of transformation functions + + # Returns + `HopsworksUdf`: The metadata object for hopsworks UDF's. + + # Raises + `hsfs.client.exceptions.FeatureStoreException` : If unable to create UDF. + """ + + def wrapper(func: Callable) -> HopsworksUdf: + udf = HopsworksUdf( + func=func, return_types=return_type, dropped_argument_names=drop + ) + return udf + + return wrapper + + +@dataclass +class TransformationFeature: + """ + Mapping of feature names to their corresponding statistics argument names in the code. + + The statistic_argument_name for a feature name would be None if the feature does not need statistics. + + Attributes + ---------- + feature_name (str) : Name of the feature. + statistic_argument_name (str) : Name of the statistics argument in the code for the feature specified in the feature name. + """ + + feature_name: str + statistic_argument_name: Optional[str] + + def to_dict(self) -> Dict[str, Any]: + return { + "feature_name": self.feature_name, + "statistic_argument_name": self.statistic_argument_name, + } + + +@typechecked +class HopsworksUdf: + """ + Meta data for user defined functions. + + Stores meta data required to execute the user defined function in both spark and python engine. + The class generates uses the metadata to dynamically generate user defined functions based on the + engine it is executed in. + + Attributes + ---------- + function_name (str) : Name of the UDF + udf_type (UDFType): Type of the UDF can be either \"model dependent\" or \"on-demand\". + return_types (List[str]): The data types of the columns returned from the UDF. + transformation_features (List[str]) : List of feature names to which the transformation function would be applied. + output_column_names (List[str]): Column names of the DataFrame returned after application of the transformation function. + dropped_features (List[str]): List of features that will be dropped after the UDF is applied. + transformation_statistics (Dict[str, FeatureDescriptiveStatistics]): Dictionary that maps the statistics_argument name in the function to the actual statistics variable. + statistics_required (bool) : True if statistics is required for any of the parameters of the UDF. + statistics_features (List[str]) : List of feature names that requires statistics. + """ + + # Mapping for converting python types to spark types - required for creating pandas UDF's. 
+ PYTHON_SPARK_TYPE_MAPPING = { + str: "string", + int: "bigint", + float: "double", + bool: "boolean", + datetime: "timestamp", + time: "timestamp", + date: "date", + } + + def __init__( + self, + func: Union[Callable, str], + return_types: Union[List[type], type, List[str], str], + name: Optional[str] = None, + transformation_features: Optional[List[TransformationFeature]] = None, + transformation_function_argument_names: Optional[ + List[TransformationFeature] + ] = None, + dropped_argument_names: Optional[List[str]] = None, + dropped_feature_names: Optional[List[str]] = None, + feature_name_prefix: Optional[str] = None, + ): + self._return_types: List[str] = HopsworksUdf._validate_and_convert_output_types( + return_types + ) + + self._feature_name_prefix: Optional[str] = ( + feature_name_prefix # Prefix to be added to feature names + ) + + self._function_name: str = func.__name__ if name is None else name + + self._function_source: str = ( + HopsworksUdf._extract_source_code(func) + if isinstance(func, Callable) + else func + ) + if not transformation_features: + # New transformation function being declared so extract source code from function + self._transformation_features: List[TransformationFeature] = ( + HopsworksUdf._extract_function_arguments(func) + if not transformation_features + else transformation_features + ) + + self._transformation_function_argument_names = [ + feature.feature_name for feature in self._transformation_features + ] + + self._dropped_argument_names: List[str] = ( + HopsworksUdf._validate_and_convert_drop_features( + dropped_argument_names, + self.transformation_features, + feature_name_prefix, + ) + ) + self._dropped_features = self._dropped_argument_names + else: + self._transformation_features = transformation_features + self._transformation_function_argument_names = ( + transformation_function_argument_names + ) + self._dropped_argument_names = dropped_argument_names + self._dropped_features = ( + dropped_feature_names + if dropped_feature_names + else dropped_argument_names + ) + + self._formatted_function_source, self._module_imports = ( + HopsworksUdf._format_source_code(self._function_source) + ) + + self._statistics: Optional[TransformationStatistics] = None + + self._udf_type: UDFType = None + + self._output_column_names: List[str] = [] + + @staticmethod + def _validate_and_convert_drop_features( + dropped_features: Union[str, List[str]], + transformation_feature: List[str], + feature_name_prefix: str, + ) -> List[str]: + """ + Function that converts dropped features to a list and validates if the dropped feature is present in the transformation function + # Arguments + dropped_features: `Union[str, List[str]]`. Features of be dropped. + transformation_feature: `List[str]`. Features to be transformed in the UDF + # Returns + `List[str]`: A list of features to be dropped. 
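Tying the return-type handling in the constructor above together, a hedged sketch of a multi-column UDF: each entry in the return-type list is mapped to a Spark type via `PYTHON_SPARK_TYPE_MAPPING` (for example `float` to `double`), and a UDF declaring more than one output returns a pandas DataFrame whose columns the Spark engine later expands with `.*`. Names and the arithmetic are illustrative.

```python
import pandas as pd
from hopsworks import udf

@udf([float, float])
def spread(value: pd.Series) -> pd.DataFrame:
    # Two declared return types -> the UDF yields a two-column DataFrame.
    return pd.DataFrame({"value_low": value * 0.9, "value_high": value * 1.1})
```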
+ """ + if not dropped_features: + return None + + dropped_features = ( + [dropped_features] + if not isinstance(dropped_features, list) + else dropped_features + ) + + feature_name_prefix = feature_name_prefix if feature_name_prefix else "" + + missing_drop_features = [] + for dropped_feature in dropped_features: + dropped_feature = feature_name_prefix + dropped_feature + if dropped_feature not in transformation_feature: + missing_drop_features.append(dropped_feature) + + if missing_drop_features: + missing_drop_features = "', '".join(missing_drop_features) + raise FeatureStoreException( + f"Cannot drop features '{missing_drop_features}' as they are not features given as arguments in the defined UDF." + ) + + return dropped_features + + @staticmethod + def _validate_and_convert_output_types( + output_types: Union[List[type], List[str]], + ) -> List[str]: + """ + Function that takes in a type or list of types validates if it is supported and return a list of strings + + # Arguments + output_types: `list`. List of python types. + + # Raises + `hsfs.client.exceptions.FeatureStoreException` : If the any of the output type is invalid + """ + convert_output_types = [] + output_types = ( + output_types if isinstance(output_types, List) else [output_types] + ) + for output_type in output_types: + if ( + output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.keys() + and output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.values() + ): + raise FeatureStoreException( + f"Output type {output_type} is not supported. Please refer to the documentation to get more information on the supported types." + ) + convert_output_types.append( + output_type + if isinstance(output_type, str) + else HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[output_type] + ) + return convert_output_types + + @staticmethod + def _get_module_imports(path: str) -> List[str]: + """Function that extracts the imports used in the python file specified in the path. + + # Arguments + path: `str`. Path to python file from which imports are to be extracted. + + # Returns + `List[str]`: A list of string that contains the import statement using in the file. + """ + imports = [] + with open(path) as fh: + root = ast.parse(fh.read(), path) + for node in ast.iter_child_nodes(root): + if isinstance(node, ast.Import): + imported_module = False + elif isinstance(node, ast.ImportFrom): + imported_module = node.module + else: + continue + for n in node.names: + if imported_module: + import_line = "from " + imported_module + " import " + n.name + elif n.asname: + import_line = "import " + n.name + " as " + n.asname + else: + import_line = "import " + n.name + imports.append(import_line) + return imports + + @staticmethod + def _extract_source_code(udf_function: Callable) -> str: + """ + Function to extract the source code of the function along with the imports used in the file. + + The module imports cannot be extracted if the function is defined in a jupyter notebook. + + # Arguments + udf_function: `Callable`. Function for which the source code must be extracted. + # Returns + `str`: a string that contains the source code of function along with the extracted module imports. + """ + try: + module_imports = HopsworksUdf._get_module_imports( + inspect.getfile(udf_function) + ) + except FileNotFoundError: + module_imports = [""] + warnings.warn( + "Cannot extract imported dependencies for the function module. 
Please make sure to import all dependencies for the UDF inside the function.", + stacklevel=2, + ) + + function_code = inspect.getsource(udf_function) + source_code = "\n".join(module_imports) + "\n" + function_code + + return source_code + + @staticmethod + def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, int]: + """ + Function to parse the source code to extract the argument along with the start and end line of the function signature + + # Arguments + source_code: `str`. Source code of a function. + # Returns + `List[str]`: List of function arguments + `str`: function signature + `int`: starting line number of function signature + `int`: ending line number of function signature + + """ + source_code = source_code.split("\n") + + signature_start_line = None + signature_end_line = None + # Find the line where the function signature is defined + for i, line in enumerate(source_code): + if line.strip().startswith("def "): + signature_start_line = i + if signature_start_line is not None and ")" in line: + signature_end_line = i + break + + # Parse the function signature to remove the specified argument + signature = "".join( + [ + code.split("#")[0] + for code in source_code[signature_start_line : signature_end_line + 1] + ] + ) + arg_list = signature.split("(")[1].split(")")[0].split(",") + arg_list = [ + arg.split(":")[0].split("=")[0].strip() + for arg in arg_list + if not arg.strip() == "" + ] + if "statistics" in arg_list: + arg_list.remove("statistics") + return arg_list, signature, signature_start_line, signature_end_line + + @staticmethod + def _extract_function_arguments(function: Callable) -> List[TransformationFeature]: + """ + Function to extract the argument names from a provided function source code. + + # Arguments + source_code: `Callable`. The function for which the value are to be extracted. + # Returns + `List[TransformationFeature]`: List of TransformationFeature that provide a mapping from feature names to corresponding statistics parameters if any is present. + """ + arg_list = [] + statistics = None + signature = inspect.signature(function).parameters + if not signature: + raise FeatureStoreException( + "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." + ) + for arg in inspect.signature(function).parameters.values(): + if arg.name == "statistics": + statistics = arg.default + else: + arg_list.append(arg.name) + + if statistics: + missing_statistic_features = [ + statistic_feature + for statistic_feature in statistics._features + if statistic_feature not in arg_list + ] + if missing_statistic_features: + missing_statistic_features = "', '".join(missing_statistic_features) + raise FeatureStoreException( + f"No argument corresponding to statistics parameter '{missing_statistic_features}' present in function definition." + ) + return [ + TransformationFeature(arg, arg if arg in statistics._features else None) + for arg in arg_list + ] + else: + return [TransformationFeature(arg, None) for arg in arg_list] + + @staticmethod + def _format_source_code(source_code: str) -> Tuple[str, str]: + """ + Function that parses the existing source code to remove statistics parameter and remove all decorators and type hints from the function source code. + + # Arguments + source_code: `str`. Source code of a function. 
+ # Returns + `Tuple[str, str]`: Tuple that contains Source code that does not contain any decorators, type hints or statistics parameters and the module imports + """ + + arg_list, signature, _, signature_end_line = ( + HopsworksUdf._parse_function_signature(source_code) + ) + module_imports = source_code.split("@")[0] + + # Reconstruct the function signature + new_signature = ( + signature.split("(")[0].strip() + "(" + ", ".join(arg_list) + "):" + ) + source_code = source_code.split("\n") + # Reconstruct the modified function as a string + modified_source = ( + new_signature + "\n\t" + "\n\t".join(source_code[signature_end_line + 1 :]) + ) + + return modified_source, module_imports + + def _get_output_column_names(self) -> str: + """ + Function that generates feature names for the transformed features + + # Returns + `List[str]`: List of feature names for the transformed columns + """ + if self._udf_type == UDFType.MODEL_DEPENDENT: + _BASE_COLUMN_NAME = ( + f'{self.function_name}_{"_".join(self.transformation_features)}_' + ) + if len(self.return_types) > 1: + return [ + f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types)) + ] + else: + return [f"{_BASE_COLUMN_NAME}"] + elif self._udf_type == UDFType.ON_DEMAND: + return [self.function_name] + + def _create_pandas_udf_return_schema_from_list(self) -> str: + """ + Function that creates the return schema required for executing the defined UDF's as pandas UDF's in Spark. + + # Returns + `str`: DDL-formatted type string that denotes the return types of the user defined function. + """ + if len(self.return_types) > 1: + return ", ".join( + [ + f"`{self.output_column_names[i]}` {self.return_types[i]}" + for i in range(len(self.return_types)) + ] + ) + else: + return self.return_types[0] + + def hopsworksUdf_wrapper(self) -> Callable: + """ + Function that creates a dynamic wrapper function for the defined udf that renames the columns output by the UDF into specified column names. + + The renames is done so that the column names match the schema expected by spark when multiple columns are returned in a pandas udf. + The wrapper function would be available in the main scope of the program. + + # Returns + `Callable`: A wrapper function that renames outputs of the User defined function into specified output column names. + """ + + # Function to make transformation function time safe. Defined as a string because it has to be dynamically injected into scope to be executed by spark + convert_timstamp_function = """def convert_timezone(date_time_col : pd.Series): + import tzlocal + current_timezone = tzlocal.get_localzone() + if date_time_col.dt.tz is None: + # if timestamp is timezone unaware, make sure it's localized to the system's timezone. + # otherwise, spark will implicitly convert it to the system's timezone. 
+ return date_time_col.dt.tz_localize(str(current_timezone)) + else: + # convert to utc, then localize to system's timezone + return date_time_col.dt.tz_localize(None).dt.tz_localize(str(current_timezone))""" + + # Defining wrapper function that renames the column names to specific names + if len(self.return_types) > 1: + code = ( + self._module_imports + + "\n" + + f"""import pandas as pd +{convert_timstamp_function} +def renaming_wrapper(*args): + {self._formatted_function_source} + df = {self.function_name}(*args) + df = df.rename(columns = {{df.columns[i]: _output_col_names[i] for i in range(len(df.columns))}}) + for col in df: + if pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = convert_timezone(df[col]) + return df""" + ) + else: + code = ( + self._module_imports + + "\n" + + f"""import pandas as pd +{convert_timstamp_function} +def renaming_wrapper(*args): + {self._formatted_function_source} + df = {self.function_name}(*args) + df = df.rename(_output_col_names[0]) + if pd.api.types.is_datetime64_any_dtype(df): + df = convert_timezone(df) + return df""" + ) + + # injecting variables into scope used to execute wrapper function. + + # Shallow copy of scope performed because updating statistics argument of scope must not affect other instances. + scope = __import__("__main__").__dict__.copy() + if self.transformation_statistics is not None: + scope.update({"statistics": self.transformation_statistics}) + scope.update({"_output_col_names": self.output_column_names}) + # executing code + exec(code, scope) + + # returning executed function object + return eval("renaming_wrapper", scope) + + def __call__(self, *features: List[str]) -> "HopsworksUdf": + """ + Set features to be passed as arguments to the user defined functions + + # Arguments + features: Name of features to be passed to the User Defined function + # Returns + `HopsworksUdf`: Meta data class for the user defined function. + # Raises + `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. + """ + + if len(features) != len(self.transformation_features): + raise FeatureStoreException( + "Number of features provided does not match the number of features provided in the UDF definition" + ) + + for arg in features: + if not isinstance(arg, str): + raise FeatureStoreException( + f'Feature names provided must be string "{arg}" is not string' + ) + transformation_feature_name = self.transformation_features + if self.dropped_features: + index_dropped_features = [ + transformation_feature_name.index(dropped_feature) + for dropped_feature in self.dropped_features + ] + updated_dropped_features = [ + features[index] for index in index_dropped_features + ] + else: + updated_dropped_features = None + + # Create a copy of the UDF to associate it with new feature names. 
+ udf = copy.deepcopy(self) + + udf._transformation_features = [ + TransformationFeature( + new_feature_name, transformation_feature.statistic_argument_name + ) + for transformation_feature, new_feature_name in zip( + self._transformation_features, features + ) + ] + udf.output_column_names = udf._get_output_column_names() + udf.dropped_features = updated_dropped_features + return udf + + def update_return_type_one_hot(self): + self._return_types = [ + self._return_types[0] + for _ in range( + len( + self.transformation_statistics[ + "statistics_feature" + ].extended_statistics["unique_values"] + ) + ) + ] + self.output_column_names = self._get_output_column_names() + + def get_udf(self, force_python_udf: bool = False) -> Callable: + """ + Function that checks the current engine type and returns the appropriate UDF. + + In the spark engine the UDF is returned as a pandas UDF. + While in the python engine the UDF is returned as python function. + + # Arguments + force_python_udf: `bool`. Force return a python compatible udf irrespective of engine. + + # Returns + `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. + """ + if self.udf_type is None: + raise FeatureStoreException("UDF Type cannot be None") + + if engine.get_type() in ["hive", "python", "training"] or force_python_udf: + return self.hopsworksUdf_wrapper() + else: + from pyspark.sql.functions import pandas_udf + + return pandas_udf( + f=self.hopsworksUdf_wrapper(), + returnType=self._create_pandas_udf_return_schema_from_list(), + ) + + def to_dict(self) -> Dict[str, Any]: + """ + Convert class into a dictionary. + + # Returns + `Dict`: Dictionary that contains all data required to json serialize the object. + """ + return { + "sourceCode": self._function_source, + "outputTypes": self.return_types, + "transformationFeatures": self.transformation_features, + "transformationFunctionArgumentNames": self._transformation_function_argument_names, + "droppedArgumentNames": self._dropped_argument_names, + "statisticsArgumentNames": self._statistics_argument_names + if self.statistics_required + else None, + "name": self._function_name, + "featureNamePrefix": self._feature_name_prefix, + } + + def json(self) -> str: + """ + Convert class into its json serialized form. + + # Returns + `str`: Json serialized object. + """ + return json.dumps(self, cls=util.FeatureStoreEncoder) + + @classmethod + def from_response_json( + cls: "HopsworksUdf", json_dict: Dict[str, Any] + ) -> "HopsworksUdf": + """ + Function that constructs the class object from its json serialization. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `HopsworksUdf`: Json deserialized class object. 
+ """ + + json_decamelized = humps.decamelize(json_dict) + function_source_code = json_decamelized["source_code"] + function_name = json_decamelized["name"] + feature_name_prefix = json_decamelized.get("feature_name_prefix", None) + output_types = [ + output_type.strip() for output_type in json_decamelized["output_types"] + ] + transformation_features = [ + feature.strip() for feature in json_decamelized["transformation_features"] + ] + dropped_argument_names = ( + [ + dropped_feature.strip() + for dropped_feature in json_decamelized["dropped_argument_names"] + ] + if "dropped_argument_names" in json_decamelized + else None + ) + statistics_features = ( + [ + feature.strip() + for feature in json_decamelized["statistics_argument_names"] + ] + if "statistics_argument_names" in json_decamelized + else None + ) + + # Reconstructing statistics arguments. + arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) + + transformation_features = ( + arg_list if not transformation_features else transformation_features + ) + + dropped_feature_names = ( + [ + transformation_features[arg_list.index(dropped_argument_name)] + for dropped_argument_name in dropped_argument_names + ] + if dropped_argument_names + else None + ) + + if statistics_features: + transformation_features = [ + TransformationFeature( + transformation_features[arg_index], + arg_list[arg_index] + if arg_list[arg_index] in statistics_features + else None, + ) + for arg_index in range(len(arg_list)) + ] + else: + transformation_features = [ + TransformationFeature(transformation_features[arg_index], None) + for arg_index in range(len(arg_list)) + ] + + hopsworks_udf = cls( + func=function_source_code, + return_types=output_types, + name=function_name, + transformation_features=transformation_features, + dropped_argument_names=dropped_argument_names, + dropped_feature_names=dropped_feature_names, + feature_name_prefix=feature_name_prefix, + ) + + # Set transformation features if already set. + return hopsworks_udf + + def _validate_udf_type(self): + """ + Function that returns validates if the defined transformation function can be used for the specified UDF type. 
+ + # Raises + `hsfs.client.exceptions.FeatureStoreException` : If the UDF Type is None or if statistics or multiple columns has been output by a on-demand transformation function + """ + + if self._udf_type == UDFType.ON_DEMAND: + if len(self.return_types) > 1: + raise FeatureStoreException( + "On-Demand Transformation functions can only return one column as output" + ) + + if self.statistics_required: + raise FeatureStoreException( + "On-Demand Transformation functions cannot use statistics, please remove statistics parameters from the functions" + ) + + @property + def return_types(self) -> List[str]: + """Get the output types of the UDF""" + # Update the number of outputs for one hot encoder to match the number of unique values for the feature + if self.function_name == "one_hot_encoder" and self.transformation_statistics: + self.update_return_type_one_hot() + return self._return_types + + @property + def function_name(self) -> str: + """Get the function name of the UDF""" + return self._function_name + + @property + def statistics_required(self) -> bool: + """Get if statistics for any feature is required by the UDF""" + return bool(self.statistics_features) + + @property + def transformation_statistics( + self, + ) -> Optional[TransformationStatistics]: + """Feature statistics required for the defined UDF""" + return self._statistics + + @property + def output_column_names(self) -> List[str]: + """Output columns names of the transformation function""" + if self._feature_name_prefix: + return [ + self._feature_name_prefix + output_col_name + for output_col_name in self._output_column_names + ] + else: + return self._output_column_names + + @property + def transformation_features(self) -> List[str]: + """ + List of feature names to be used in the User Defined Function. + """ + if self._feature_name_prefix: + return [ + self._feature_name_prefix + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] + + else: + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] + + @property + def statistics_features(self) -> List[str]: + """ + List of feature names that require statistics + """ + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + if transformation_feature.statistic_argument_name is not None + ] + + @property + def _statistics_argument_mapping(self) -> Dict[str, str]: + """ + Dictionary that maps feature names to the statistics arguments names in the User defined function. + """ + return { + transformation_feature.feature_name: transformation_feature.statistic_argument_name + for transformation_feature in self._transformation_features + } + + @property + def _statistics_argument_names(self) -> List[str]: + """ + List of argument names required for statistics + """ + return [ + transformation_feature.statistic_argument_name + for transformation_feature in self._transformation_features + if transformation_feature.statistic_argument_name is not None + ] + + @property + def udf_type(self) -> UDFType: + """Type of the UDF : Can be \"model dependent\" or \"on-demand\" """ + return self._udf_type + + @udf_type.setter + def udf_type(self, udf_type: UDFType) -> None: + self._udf_type = udf_type + self._validate_udf_type() + self._output_column_names = self._get_output_column_names() + + @property + def dropped_features(self) -> List[str]: + """ + List of features that will be dropped after the UDF is applied. 
+ """ + if self._feature_name_prefix and self._dropped_features: + return [ + self._feature_name_prefix + dropped_feature + for dropped_feature in self._dropped_features + ] + else: + return self._dropped_features + + @dropped_features.setter + def dropped_features(self, features: List[str]) -> None: + self._dropped_features = HopsworksUdf._validate_and_convert_drop_features( + features, self.transformation_features, self._feature_name_prefix + ) + + @transformation_statistics.setter + def transformation_statistics( + self, statistics: List[FeatureDescriptiveStatistics] + ) -> None: + self._statistics = TransformationStatistics(*self._statistics_argument_names) + for stat in statistics: + if stat.feature_name in self._statistics_argument_mapping.keys(): + self._statistics.set_statistics( + self._statistics_argument_mapping[stat.feature_name], stat.to_dict() + ) + + @output_column_names.setter + def output_column_names(self, output_col_names: Union[str, List[str]]) -> None: + if not isinstance(output_col_names, List): + output_col_names = [output_col_names] + if len(output_col_names) != len(self.return_types): + raise FeatureStoreException( + f"Provided names for output columns does not match the number of columns returned from the UDF. Please provide {len(self.return_types)} names." + ) + else: + self._output_column_names = output_col_names + + def __repr__(self): + return f'{self.function_name}({", ".join(self.transformation_features)})' diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 5f51044546..f19b95e037 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -29,7 +29,6 @@ statistics_engine, training_dataset_api, training_dataset_engine, - transformation_function_engine, vector_server, ) from hsfs.statistics_config import StatisticsConfig @@ -538,7 +537,6 @@ def __init__( from_query=None, querydto=None, label=None, - transformation_functions=None, train_split=None, time_split_size=None, extra_filter=None, @@ -580,7 +578,6 @@ def __init__( self._querydto = querydto self._feature_store_id = featurestore_id self._feature_store_name = featurestore_name - self._transformation_functions = transformation_functions self._training_dataset_api = training_dataset_api.TrainingDatasetApi( featurestore_id @@ -592,9 +589,6 @@ def __init__( featurestore_id, self.ENTITY_TYPE ) self._code_engine = code_engine.CodeEngine(featurestore_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine(featurestore_id) - ) self._vector_server = vector_server.VectorServer( featurestore_id, features=self._features ) @@ -1084,19 +1078,6 @@ def feature_store_name(self) -> str: """Name of the feature store in which the feature group is located.""" return self._feature_store_name - @property - def transformation_functions(self): - """Set transformation functions.""" - if self._id is not None and self._transformation_functions is None: - self._transformation_functions = ( - self._transformation_function_engine.get_td_transformation_fn(self) - ) - return self._transformation_functions - - @transformation_functions.setter - def transformation_functions(self, transformation_functions): - self._transformation_functions = transformation_functions - @property def serving_keys(self) -> Set[str]: """Set of primary key names that is used as keys in input dict object for `get_serving_vector` method.""" diff --git a/python/hsfs/training_dataset_feature.py b/python/hsfs/training_dataset_feature.py 
index 6c3a04ea3d..3aa3f6a81f 100644 --- a/python/hsfs/training_dataset_feature.py +++ b/python/hsfs/training_dataset_feature.py @@ -15,11 +15,14 @@ # from __future__ import annotations +from typing import Optional + import humps from hsfs import feature as feature_mod from hsfs import feature_group as feature_group_mod -from hsfs import transformation_function as tf_mod from hsfs import util +from hsfs.hopsworks_udf import UDFType +from hsfs.transformation_function import TransformationFunction class TrainingDatasetFeature: @@ -33,7 +36,7 @@ def __init__( label=False, inference_helper_column=False, training_helper_column=False, - transformation_function=None, + transformation_function: Optional[TransformationFunction] = None, **kwargs, ): self._name = util.autofix_feature_name(name) @@ -48,10 +51,9 @@ def __init__( self._label = label self._inference_helper_column = inference_helper_column self._training_helper_column = training_helper_column - self._transformation_function = ( - tf_mod.TransformationFunction.from_response_json(transformation_function) - if isinstance(transformation_function, dict) - else transformation_function + + self._on_demand_transformation_function: Optional[TransformationFunction] = ( + transformation_function if transformation_function else None ) def to_dict(self): @@ -62,14 +64,23 @@ def to_dict(self): "label": self._label, "inferenceHelperColumn": self._inference_helper_column, "trainingHelperColumn": self._training_helper_column, - "transformationFunction": self._transformation_function, "featureGroupFeatureName": self._feature_group_feature_name, "featuregroup": self._feature_group, + "transformation_function": self._on_demand_transformation_function, } @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) + if json_decamelized.get("transformation_function", False): + json_decamelized["transformation_function"]["transformation_type"] = ( + UDFType.ON_DEMAND + ) + json_decamelized["transformation_function"] = ( + TransformationFunction.from_response_json( + json_decamelized.get("transformation_function") + ) + ) return cls(**json_decamelized) def is_complex(self): @@ -118,6 +129,11 @@ def inference_helper_column(self): def inference_helper_column(self, inference_helper_column): self._inference_helper_column = inference_helper_column + @property + def on_demand_transformation_function(self) -> TransformationFunction: + """Whether the feature is a on-demand feature computed using on-demand transformation functions""" + return self._on_demand_transformation_function + @property def training_helper_column(self): """Indicator if it is feature.""" @@ -127,15 +143,6 @@ def training_helper_column(self): def training_helper_column(self, training_helper_column): self._training_helper_column = training_helper_column - @property - def transformation_function(self): - """Set transformation functions.""" - return self._transformation_function - - @transformation_function.setter - def transformation_function(self, transformation_function): - self._transformation_function = transformation_function - @property def feature_group(self): return self._feature_group @@ -145,4 +152,4 @@ def feature_group_feature_name(self): return self._feature_group_feature_name def __repr__(self): - return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._transformation_function}, {self._feature_group_feature_name}, {self._feature_group.id!r})" + return f"Training Dataset Feature({self._name!r}, 
{self._type!r}, {self._index!r}, {self._label}, {self._feature_group_feature_name}, {self._feature_group.id!r}, {self.on_demand_transformation_function})" diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index ffd88fd502..fe30047384 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -14,88 +14,75 @@ # from __future__ import annotations -import ast -import inspect +import copy import json +from typing import Any, Dict, List, Optional, Union import humps from hsfs import util +from hsfs.client.exceptions import FeatureStoreException from hsfs.core import transformation_function_engine +from hsfs.decorators import typechecked +from hsfs.hopsworks_udf import HopsworksUdf, UDFType +@typechecked class TransformationFunction: + """ + Main DTO class for transformation functions. + + Attributes + ---------- + id (int) : Id of transformation function. + version (int) : Version of transformation function. + hopsworks_udf (HopsworksUdf): Meta data class for user defined functions. + """ + def __init__( self, - featurestore_id, - transformation_fn=None, - version=None, - name=None, - source_code_content=None, - builtin_source_code=None, - output_type=None, - id=None, + featurestore_id: int, + hopsworks_udf: HopsworksUdf, + version: Optional[int] = None, + id: Optional[int] = None, + transformation_type: Optional[UDFType] = None, type=None, items=None, count=None, href=None, **kwargs, ): - self._id = id - self._featurestore_id = featurestore_id - self._version = version - self._name = name - self._transformation_fn = transformation_fn - self._source_code_content = source_code_content + self._id: int = id + self._featurestore_id: int = featurestore_id + self._version: int = version self._transformation_function_engine = ( transformation_function_engine.TransformationFunctionEngine( self._featurestore_id ) ) - - # set up depending on user initialized - if self._transformation_fn is not None: - # type -> user init coming from user - self._transformer_code = None - self._extract_source_code() - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - elif builtin_source_code is not None: - # user triggered to register built-in transformation function - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - self._source_code_content = json.dumps( - { - "module_imports": "", - "transformer_code": builtin_source_code, - } + if not isinstance(hopsworks_udf, HopsworksUdf): + raise FeatureStoreException( + "Please use the hopsworks_udf decorator when defining transformation functions." ) - else: - # load backend response - # load original source code - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - self._load_source_code(self._source_code_content) - self._feature_group_feature_name = None - self._feature_group_id = None + self._hopsworks_udf: HopsworksUdf = hopsworks_udf + self._hopsworks_udf.udf_type = transformation_type - def save(self): - """Persist transformation function in backend. + def save(self) -> None: + """Save a transformation function into the backend. !!! 
example ```python + # import hopsworks udf decorator + from hsfs.hopsworks_udf import HopsworksUdf # define function + @udf(int) def plus_one(value): return value + 1 # create transformation function plus_one_meta = fs.create_transformation_function( transformation_function=plus_one, - output_type=int, version=1 ) @@ -105,19 +92,21 @@ def plus_one(value): """ self._transformation_function_engine.save(self) - def delete(self): + def delete(self) -> None: """Delete transformation function from backend. !!! example ```python + # import hopsworks udf decorator + from hsfs.hopsworks_udf import HopsworksUdf # define function + @udf(int) def plus_one(value): return value + 1 # create transformation function plus_one_meta = fs.create_transformation_function( transformation_function=plus_one, - output_type=int, version=1 ) # persist transformation function in backend @@ -132,158 +121,127 @@ def plus_one(value): """ self._transformation_function_engine.delete(self) - def _extract_source_code(self): - if not callable(self._transformation_fn): - raise ValueError("transformer must be callable") - - self._name = self._transformation_fn.__name__ - - transformer_code = inspect.getsource(self._transformation_fn) - - module_imports = self._get_module_imports( - self._get_module_path(self._transformation_fn.__module__) - ) - - self._transformer_code = "\n".join(module_imports) + "\n" + transformer_code - - # initialise source code dict - # add all imports from module - # add original source code that will be used during offline transformations - self._source_code_content = json.dumps( - { - "module_imports": "\n".join(module_imports), - "transformer_code": transformer_code, - } - ) - - @staticmethod - def _get_module_path(module_name): - def _get_module_path(module): - return module.__file__ - - module_path = {} - exec( - """import %s\nmodule_path["path"] = _get_module_path(%s)""" - % (module_name, module_name) - ) - return module_path["path"] - - @staticmethod - def _get_module_imports(path): - imports = [] - with open(path) as fh: - root = ast.parse(fh.read(), path) - - for node in ast.iter_child_nodes(root): - if isinstance(node, ast.Import): - imported_module = False - elif isinstance(node, ast.ImportFrom): - imported_module = node.module - else: - continue - - for n in node.names: - if imported_module: - import_line = "from " + imported_module + " import " + n.name - elif n.asname: - import_line = "import " + n.name + " as " + n.asname - else: - import_line = "import " + n.name - imports.append(import_line) - return imports - - def _load_source_code(self, source_code_content): - source_code_content = json.loads(source_code_content) - module_imports = source_code_content["module_imports"] - transformer_code = source_code_content["transformer_code"] - self._transformer_code = module_imports + "\n" * 2 + transformer_code - - scope = __import__("__main__").__dict__ - exec(self._transformer_code, scope) - self._transformation_fn = eval(self._name, scope) - self._transformation_fn._code = self._transformer_code + def __call__(self, *features: List[str]) -> TransformationFunction: + """ + Update the feature to be using in the transformation function + + # Arguments + features: `List[str]`. Name of features to be passed to the User Defined function + # Returns + `HopsworksUdf`: Meta data class for the user defined function. + # Raises + `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. 
+ """ + # Deep copy so that the same transformation function can be used to create multiple new transformation function with different features. + transformation = copy.deepcopy(self) + transformation._hopsworks_udf = transformation._hopsworks_udf(*features) + return transformation @classmethod - def from_response_json(cls, json_dict): + def from_response_json( + cls, json_dict: Dict[str, Any] + ) -> Union[TransformationFunction, List[TransformationFunction]]: + """ + Function that constructs the class object from its json serialization. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ json_decamelized = humps.decamelize(json_dict) + if "count" in json_decamelized: if json_decamelized["count"] == 0: return [] - return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] + for tffn_dto in json_decamelized["items"]: + if tffn_dto.get("hopsworks_udf", False): + tffn_dto["hopsworks_udf"] = HopsworksUdf.from_response_json( + tffn_dto["hopsworks_udf"] + ) + if json_decamelized["count"] == 1: + return cls(**json_decamelized["items"][0]) + else: + return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] else: + if json_decamelized.get("hopsworks_udf", False): + json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( + json_decamelized["hopsworks_udf"] + ) return cls(**json_decamelized) - def update_from_response_json(self, json_dict): + def update_from_response_json( + self, json_dict: Dict[str, Any] + ) -> TransformationFunction: + """ + Function that updates the class object from its json serialization. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ json_decamelized = humps.decamelize(json_dict) self.__init__(**json_decamelized) return self - def json(self): + def json(self) -> str: + """ + Convert class into its json serialized form. + + # Returns + `str`: Json serialized object. + """ return json.dumps(self, cls=util.FeatureStoreEncoder) - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: + """ + Convert class into a dictionary. + + # Returns + `Dict`: Dictionary that contains all data required to json serialize the object. 
+ """ return { "id": self._id, - "name": self._name, "version": self._version, - "sourceCodeContent": self._source_code_content, - "outputType": self._output_type, "featurestoreId": self._featurestore_id, + "hopsworksUdf": self._hopsworks_udf, } @property - def id(self): - """Training dataset id.""" + def id(self) -> id: + """Transformation function id.""" return self._id @id.setter - def id(self, id): + def id(self, id: int) -> None: self._id = id @property - def name(self): - return self._name - - @property - def version(self): + def version(self) -> int: + """Version of the transformation function.""" return self._version - @property - def transformer_code(self): - return self._transformer_code - - @property - def transformation_fn(self): - return self._transformation_fn - - @property - def source_code_content(self): - return self._source_code_content - - @property - def output_type(self): - return self._output_type - - @name.setter - def name(self, name): - self._name = name - @version.setter - def version(self, version): + def version(self, version: int) -> None: self._version = version - @transformer_code.setter - def transformer_code(self, transformer_code): - self._transformer_code = transformer_code - - @transformation_fn.setter - def transformation_fn(self, transformation_fn): - self._transformation_fn = transformation_fn - - @source_code_content.setter - def source_code_content(self, source_code_content): - self._source_code_content = source_code_content + @property + def hopsworks_udf(self) -> HopsworksUdf: + """Meta data class for the user defined transformation function.""" + return self._hopsworks_udf - @output_type.setter - def output_type(self, output_type): - self._output_type = output_type + @property + def output_column_names(self) -> List[str]: + """Output column names of transformation functions""" + return self._hopsworks_udf._output_column_names + + def __repr__(self): + if self.hopsworks_udf._udf_type == UDFType.MODEL_DEPENDENT: + return ( + f"Model-Dependent Transformation Function : {repr(self.hopsworks_udf)}" + ) + elif self.hopsworks_udf._udf_type == UDFType.ON_DEMAND: + return f"On-Demand Transformation Function : {repr(self.hopsworks_udf)}" + else: + return f"Transformation Function : {repr(self.hopsworks_udf)}" diff --git a/python/hsfs/transformation_function_attached.py b/python/hsfs/transformation_function_attached.py deleted file mode 100644 index ca4deceddb..0000000000 --- a/python/hsfs/transformation_function_attached.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2021. Logical Clocks AB -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from __future__ import annotations - -import humps -from hsfs import transformation_function as transformation_fn - - -class TransformationFunctionAttached: - def __init__( - self, - name, - transformation_function, - type=None, - items=None, - count=None, - href=None, - **kwargs, - ): - self._name = name - self._transformation_function = ( - transformation_fn.TransformationFunction.from_response_json( - transformation_function - ) - if isinstance(transformation_function, dict) - else transformation_function - ) - - @classmethod - def from_response_json(cls, json_dict): - json_decamelized = humps.decamelize(json_dict) - if "count" in json_decamelized: - if json_decamelized["count"] == 0: - return [] - return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] - else: - return cls(**json_decamelized) - - def update_from_response_json(self, json_dict): - json_decamelized = humps.decamelize(json_dict) - self.__init__(**json_decamelized) - return self - - @property - def name(self): - """Set feature name.""" - return self._name - - @name.setter - def name(self, name): - self._name = name - - @property - def transformation_function(self): - """Set transformation functions.""" - return self._transformation_function - - @transformation_function.setter - def transformation_function(self, transformation_function): - self._transformation_function = transformation_function diff --git a/python/hsfs/transformation_statistics.py b/python/hsfs/transformation_statistics.py new file mode 100644 index 0000000000..c4a1bc20b1 --- /dev/null +++ b/python/hsfs/transformation_statistics.py @@ -0,0 +1,124 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +import json +from dataclasses import dataclass +from typing import Any, Dict, Mapping, Optional, Union + +import humps + + +@dataclass +class FeatureTransformationStatistics: + """ + Data class that contains all the statistics parameters that can be used for transformations. 
+ """ + + feature_name: str + count: int = None + # for any feature type + completeness: Optional[float] = None + num_non_null_values: Optional[int] = None + num_null_values: Optional[int] = None + approx_num_distinct_values: Optional[int] = None + # for numerical features + min: Optional[float] = None + max: Optional[float] = None + sum: Optional[float] = None + mean: Optional[float] = None + stddev: Optional[float] = None + percentiles: Optional[Mapping[str, float]] = None + # with exact uniqueness + distinctness: Optional[float] = None + entropy: Optional[float] = None + uniqueness: Optional[float] = None + exact_num_distinct_values: Optional[int] = None + extended_statistics: Optional[Union[dict, str]] = None + + def __init__( + self, + feature_name: str, + count: int = None, + completeness: Optional[float] = None, + num_non_null_values: Optional[int] = None, + num_null_values: Optional[int] = None, + approx_num_distinct_values: Optional[int] = None, + min: Optional[float] = None, + max: Optional[float] = None, + sum: Optional[float] = None, + mean: Optional[float] = None, + stddev: Optional[float] = None, + percentiles: Optional[Mapping[str, float]] = None, + distinctness: Optional[float] = None, + entropy: Optional[float] = None, + uniqueness: Optional[float] = None, + exact_num_distinct_values: Optional[int] = None, + extended_statistics: Optional[Union[dict, str]] = None, + **kwargs, + ): + self.feature_name = feature_name + self.count = count + self.completeness = completeness + self.num_non_null_values = num_non_null_values + self.num_null_values = num_null_values + self.approx_num_distinct_values = approx_num_distinct_values + self.min = min + self.max = max + self.sum = sum + self.mean = mean + self.stddev = stddev + self.percentiles = percentiles + self.distinctness = distinctness + self.entropy = entropy + self.uniqueness = uniqueness + self.exact_num_distinct_values = exact_num_distinct_values + self.extended_statistics = ( + extended_statistics + if not isinstance(extended_statistics, str) + else json.loads(extended_statistics) + ) + + @classmethod + def from_response_json( + cls: FeatureTransformationStatistics, json_dict: Dict[str, Any] + ) -> FeatureTransformationStatistics: + json_decamelized = humps.decamelize(json_dict) + return cls(**json_decamelized) + + +class TransformationStatistics: + """ + Class that stores statistics of all features required for a transformation function. + """ + + def __init__(self, *features: str): + self._features = features + self.__dict__.update( + {feature: self.init_statistics(feature) for feature in features} + ) + + def init_statistics(self, feature_name: str) -> FeatureTransformationStatistics: + return FeatureTransformationStatistics(feature_name=feature_name) + + def set_statistics(self, feature_name: str, statistics: Dict[str, Any]) -> None: + self.__dict__[feature_name] = ( + FeatureTransformationStatistics.from_response_json(statistics) + ) + + def __repr__(self) -> str: + return ",\n ".join([repr(self.__dict__[feature]) for feature in self._features]) diff --git a/python/pyproject.toml b/python/pyproject.toml index 2b3d69db4d..77fe01a61f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -142,7 +142,7 @@ exclude = [ "node_modules", "site-packages", "venv", - "java", + "java" ] # Same as Black. 
diff --git a/python/tests/core/test_arrow_flight_client.py b/python/tests/core/test_arrow_flight_client.py index 0b647aedf1..faa480c6ad 100644 --- a/python/tests/core/test_arrow_flight_client.py +++ b/python/tests/core/test_arrow_flight_client.py @@ -77,9 +77,6 @@ def _arrange_featureview_mocks(self, mocker, backend_fixtures): "hsfs.core.feature_view_engine.FeatureViewEngine.get_batch_query", return_value=fg.select_all(), ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mocker.patch("hsfs.engine.python.Engine._apply_transformation_function") # required for batch query diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index d8410aa21e..f1c3f7ab3d 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -23,7 +23,6 @@ feature_view, split_statistics, training_dataset, - transformation_function_attached, ) from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import fs_query @@ -95,9 +94,6 @@ def test_save(self, mocker): "hsfs.core.feature_view_engine.FeatureViewEngine._get_feature_view_url", return_value=feature_view_url, ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", - ) mock_print = mocker.patch("builtins.print") fv_engine = feature_view_engine.FeatureViewEngine( @@ -113,7 +109,6 @@ def test_save(self, mocker): # Assert assert mock_fv_api.return_value.post.call_count == 1 - assert mock_attach_transformation.call_count == 1 assert mock_print.call_count == 1 assert mock_print.call_args[0][ 0 @@ -352,12 +347,6 @@ def test_get_name(self, mocker): feature_store_id = 99 mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn" - ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", - ) fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id @@ -385,7 +374,6 @@ def test_get_name(self, mocker): # Assert assert mock_fv_api.return_value.get_by_name_version.call_count == 0 - assert mock_attach_transformation.call_count == 2 assert mock_fv_api.return_value.get_by_name.call_count == 1 assert len(result) == 2 @@ -394,12 +382,6 @@ def test_get_name_version(self, mocker): feature_store_id = 99 mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn" - ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", - ) fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id @@ -420,7 +402,6 @@ def test_get_name_version(self, mocker): # Assert assert mock_fv_api.return_value.get_by_name_version.call_count == 1 - assert mock_attach_transformation.call_count == 1 assert mock_fv_api.return_value.get_by_name.call_count == 0 def test_delete_name(self, mocker): @@ -566,41 +547,6 @@ def test_get_batch_query_string_pit_query(self, mocker): assert mock_fv_api.return_value.get_batch_query.call_count == 1 assert mock_qc_api.return_value.construct_query.call_count == 1 - def test_attach_transformation_function(self, mocker): - def 
testFunction(): - print("Test") - - tf = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn", - return_value={"label": tf}, - ) - feature_store_id = 99 - fv_engine = feature_view_engine.FeatureViewEngine( - feature_store_id=feature_store_id - ) - fv = feature_view.FeatureView( - name="fv_name", - version=1, - query=query, - featurestore_id=feature_store_id, - ) - fv.schema = query.features - - # Act - fv_engine.attach_transformation_function(fv) - - # Assert - id_feature = fv.schema[0] - label_feature = fv.schema[1] - assert id_feature.name == "id" - assert id_feature.transformation_function is None - assert label_feature.name == "label" - assert label_feature.transformation_function == tf - def test_create_training_dataset(self, mocker): # Arrange feature_store_id = 99 @@ -1619,7 +1565,6 @@ def test_get_training_dataset_metadata(self, mocker): # Assert assert mock_fv_api.return_value.get_training_dataset_by_version.call_count == 1 assert result.schema == fv.schema - assert result.transformation_functions == fv.transformation_functions def test_create_training_data_metadata(self, mocker): # Arrange diff --git a/python/tests/core/test_training_dataset_engine.py b/python/tests/core/test_training_dataset_engine.py index a1e28c49ae..c1a55ca00a 100644 --- a/python/tests/core/test_training_dataset_engine.py +++ b/python/tests/core/test_training_dataset_engine.py @@ -14,12 +14,10 @@ # limitations under the License. # -import pytest from hsfs import ( feature_group, training_dataset, training_dataset_feature, - transformation_function, ) from hsfs.constructor import query from hsfs.core import training_dataset_engine @@ -31,9 +29,6 @@ def test_save(self, mocker): feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") @@ -75,9 +70,6 @@ def test_save_query(self, mocker, backend_fixtures): mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") @@ -106,72 +98,12 @@ def test_save_query(self, mocker, backend_fixtures): assert td._features[0].label is True assert td._features[1].label is True - def test_save_transformation_functions(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.transformation_function.TransformationFunction._extract_source_code" - ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") - - def plus_one(a): - return a + 1 - - tf = transformation_function.TransformationFunction( - 1, plus_one, 1, "plus_one", output_type=str - ) - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={}, - label=["f", "f_wrong"], - 
transformation_functions=tf, - ) - - td_engine = training_dataset_engine.TrainingDatasetEngine(feature_store_id) - - f = training_dataset_feature.TrainingDatasetFeature( - name="f", type="str", label=False - ) - f1 = training_dataset_feature.TrainingDatasetFeature( - name="f1", type="int", label=False - ) - - features = [f, f1] - - mock_engine_get_instance.return_value.parse_schema_training_dataset.return_value = features - - # Act - with pytest.raises(ValueError) as e_info: - td_engine.save(training_dataset=td, features=None, user_write_options=None) - - # Assert - assert mock_td_api.return_value.post.call_count == 0 - assert len(td._features) == 2 - assert td._features[0].label is True - assert td._features[1].label is False - assert ( - str(e_info.value) - == "Transformation functions can only be applied to training datasets generated from Query object" - ) - def test_save_splits(self, mocker): # Arrange feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") mock_warning = mocker.patch("warnings.warn") @@ -210,8 +142,7 @@ def test_save_splits(self, mocker): assert ( mock_warning.call_args[0][0] == "Training dataset splits were defined but no `train_split` (the name of the split that is going to be " - "used for training) was provided. Setting this property to `train`. The statistics of this " - "split will be used for transformation functions." + "used for training) was provided. Setting this property to `train`. " ) def test_insert(self, mocker): diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index fcbb85ab21..e56e820d87 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -14,10 +14,7 @@ # limitations under the License. # -import datetime - -import numpy -import pytest +import pandas as pd from hsfs import ( engine, feature, @@ -25,11 +22,9 @@ feature_view, training_dataset, transformation_function, - transformation_function_attached, ) -from hsfs.client.exceptions import FeatureStoreException -from hsfs.constructor.query import Query from hsfs.core import transformation_function_engine +from hsfs.hopsworks_udf import UDFType, udf fg1 = feature_group.FeatureGroup( @@ -88,9 +83,6 @@ def test_save(self, mocker): # Arrange feature_store_id = 99 - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) mock_tf_api = mocker.patch( "hsfs.core.transformation_function_api.TransformationFunctionApi" ) @@ -99,61 +91,26 @@ def test_save(self, mocker): feature_store_id ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.save(transformation_fn_instance=tf) - - # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 0 - assert ( - str(e_info.value) - == "Transformation function name 'tf_name' with version 1 is reserved for built-in " - "hsfs functions. 
Please use other name or version" - ) - - def test_save_is_builtin(self, mocker): - # Arrange - feature_store_id = 99 - - mock_tf_engine_is_builtin = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) + @udf(int) + def testFunction(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" + feature_store_id, + hopsworks_udf=testFunction, + transformation_type=UDFType.MODEL_DEPENDENT, ) - mock_tf_engine_is_builtin.return_value = False - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.save(transformation_fn_instance=tf) + tf_engine.save(transformation_fn_instance=tf) # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 0 - assert str(e_info.value) == "transformer must be callable" + assert mock_tf_api.return_value.register_transformation_fn.call_count == 1 - def test_save_is_builtin_callable(self, mocker): + def test_get_transformation_fn(self, mocker): # Arrange feature_store_id = 99 - mocker.patch( - "hsfs.transformation_function.TransformationFunction._extract_source_code" - ) - mock_tf_engine_is_builtin = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) mock_tf_api = mocker.patch( "hsfs.core.transformation_function_api.TransformationFunctionApi" ) @@ -162,43 +119,27 @@ def test_save_is_builtin_callable(self, mocker): feature_store_id ) - def testFunction(): - print("Test") + @udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) - mock_tf_engine_is_builtin.return_value = False - - # Act - tf_engine.save(transformation_fn_instance=tf) - - # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 1 - - def test_get_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) + @udf(float) + def testFunction2(data2, statistics_data2): + return data2 + 1 - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id + tf2 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction2, + transformation_type=UDFType.MODEL_DEPENDENT, ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) - tf1 = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf1_name" - ) - transformations = [tf, tf1] + transformations = [tf1, tf2] mock_tf_api.return_value.get_transformation_fn.return_value = transformations @@ -207,7 +148,7 @@ def test_get_transformation_fn(self, mocker): # Assert assert mock_tf_api.return_value.get_transformation_fn.call_count == 1 - assert result == tf + assert result == transformations def test_get_transformation_fns(self, mocker): # Arrange @@ -221,13 +162,27 @@ def test_get_transformation_fns(self, mocker): 
feature_store_id ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) + @udf(int) + def testFunction1(col1): + return col1 + 1 + tf1 = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf1_name" + feature_store_id, + hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + + @udf(float) + def testFunction2(data2, statistics_data2): + return data2 + 1 + + tf2 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction2, + transformation_type=UDFType.MODEL_DEPENDENT, ) - transformations = [tf, tf1] + + transformations = [tf1, tf2] mock_tf_api.return_value.get_transformation_fn.return_value = transformations @@ -250,1332 +205,283 @@ def test_delete(self, mocker): feature_store_id ) - # Act - tf_engine.delete(transformation_function_instance=None) - - # Assert - assert mock_tf_api.return_value.delete.call_count == 1 - - def test_get_td_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def plus_one(a): - return a + 1 - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=plus_one - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=plus_one - ) - - transformations_attached = [tf_attached, tf1_attached] + @udf(int) + def testFunction1(col1): + return col1 + 1 - mock_tf_api.return_value.get_td_transformation_fn.return_value = ( - transformations_attached + tf1 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) # Act - result = tf_engine.get_td_transformation_fn(training_dataset=None) + tf_engine.delete(transformation_function_instance=tf1) # Assert - assert "tf_name" in result - assert "tf1_name" in result - assert mock_tf_api.return_value.get_td_transformation_fn.call_count == 1 + assert mock_tf_api.return_value.delete.call_count == 1 - def test_attach_transformation_fn_td(self, mocker): + def test_compute_transformation_fn_statistics(self, mocker): # Arrange feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch("hsfs.constructor.fs_query.FsQuery") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - td = training_dataset.TrainingDataset( name="test", version=1, data_format="CSV", - featurestore_id=feature_store_id, + featurestore_id=99, splits={}, id=10, - transformation_functions=transformation_fn_dict, ) # Act - with pytest.raises(AttributeError) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=td, feature_view_obj=None - ) + tf_engine.compute_transformation_fn_statistics( + training_dataset_obj=td, + statistics_features=None, + 
label_encoder_features=None, + feature_dataframe=None, + feature_view_obj=None, + ) # Assert - assert str(e_info.value) == "'TrainingDataset' object has no attribute 'labels'" + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 1 + ) - def test_attach_transformation_fn_fv(self, mocker): - # Arrange + def test_compute_and_set_feature_statistics_no_split(self, mocker): feature_store_id = 99 - mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") + @udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, + ) - fv = feature_view.FeatureView( + td = training_dataset.TrainingDataset( name="test", - query=query, + version=1, + data_format="CSV", featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], + splits={}, + id=10, ) # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert len(fv._features) == 2 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "tf1_name" - - def test_attach_transformation_fn_fv_self_join(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["fg1_tf_name"] = tf - fv = feature_view.FeatureView( name="test", - query=query_self_join, - featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], ) + dataset = pd.DataFrame() + # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv + tf_engine.compute_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, dataset=dataset ) # Assert - assert len(fv._features) == 2 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "fg1_tf_name" + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 0 + ) - def test_attach_transformation_fn_fv_q_prefix(self, mocker): - # Arrange + def test_compute_and_set_feature_statistics_train_test_split(self, mocker): feature_store_id = 99 - mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - 
print("Test") + @udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["second_tf1_name"] = tf - transformation_fn_dict["third_tf_name"] = tf - transformation_fn_dict["third_tf1_name"] = tf - - fv = feature_view.FeatureView( - name="test", - query=query_prefix, + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], - ) - - # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, ) - # Assert - assert len(fv._features) == 4 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "second_tf1_name" - assert fv._features[2].name == "third_tf_name" - assert fv._features[3].name == "third_tf1_name" - - def test_attach_transformation_fn_fv_q_prefix_fail(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - query_no_prefix = ( - fg1.select_all() - .join(fg2.select(["tf1_name"]), on=["id"]) - .join(fg3.select(["tf_name", "tf1_name"]), on=["id"]) - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + td = training_dataset.TrainingDataset( + name="test", + version=1, + data_format="CSV", + featurestore_id=99, + splits={"train": 0.8, "test": 0.2}, + id=10, ) - transformation_fn_dict = dict() - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - fv = feature_view.FeatureView( name="test", - query=query_no_prefix, - featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], ) + dataset = pd.DataFrame() + # Act - with pytest.raises(FeatureStoreException) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) + tf_engine.compute_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, dataset=dataset + ) # Assert - assert str(e_info.value) == Query.ERROR_MESSAGE_FEATURE_AMBIGUOUS.format( - "tf_name" + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 0 ) - def test_attach_transformation_fn_fv_labels(self, mocker): - # Arrange + def test_get_and_set_feature_statistics_no_statistics_required(self, mocker): feature_store_id = 99 - mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") + @udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - 
builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) - transformation_fn_dict = dict() + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, + ) - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf + td = training_dataset.TrainingDataset( + name="test", + version=1, + data_format="CSV", + featurestore_id=99, + splits={"train": 0.8, "test": 0.2}, + id=10, + ) fv = feature_view.FeatureView( name="test", - query=query, - featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=["tf_name"], + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], ) # Act - with pytest.raises(ValueError) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) + tf_engine.get_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, training_dataset_version=1 + ) # Assert - assert ( - str(e_info.value) - == "Online transformations for training dataset labels are not supported." - ) + assert mock_s_engine.return_value.get.call_count == 0 - def test_is_builtin(self): - # Arrange + def test_get_and_set_feature_statistics_statistics_required(self, mocker): feature_store_id = 99 + mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) + from hsfs.transformation_statistics import TransformationStatistics - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="tf_name", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is False + stats = TransformationStatistics("col1") - def test_is_builtin_min_max_scaler(self): - # Arrange - feature_store_id = 99 + @udf(int) + def testFunction1(col1, statistics=stats): + return col1 + statistics.col1.mean - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id + tf1 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="min_max_scaler", + fg1 = feature_group.FeatureGroup( + name="test1", version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, ) - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_is_builtin_min_max_scaler_version(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id + td = training_dataset.TrainingDataset( + name="test", + version=1, + data_format="CSV", + featurestore_id=99, + splits={"train": 0.8, "test": 0.2}, + id=10, ) - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="min_max_scaler", - version=2, + fv = feature_view.FeatureView( + name="test", + featurestore_id=feature_store_id, + query=fg1.select_all(), + 
transformation_functions=[tf1], ) # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is False - - def test_is_builtin_standard_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="standard_scaler", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_is_builtin_robust_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="robust_scaler", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_is_builtin_label_encoder(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="label_encoder", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_populate_builtin_fn_arguments(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def tf_name(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=tf_name, output_type="str" - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert str(e_info.value) == "Not implemented" - - def test_populate_builtin_fn_arguments_min_max_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.min_max_scaler_stats", - return_value=(1, 100), - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def min_max_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=min_max_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["min_value"] == 1 - assert tf.transformation_fn.keywords["max_value"] == 100 - - def test_populate_builtin_fn_arguments_standard_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.standard_scaler_stats", - return_value=(1, 100), - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def standard_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=standard_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - 
transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["mean"] == 1 - assert tf.transformation_fn.keywords["std_dev"] == 100 - - def test_populate_builtin_fn_arguments_robust_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.robust_scaler_stats", - return_value={24: 1, 49: 2, 74: 3}, - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def robust_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=robust_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["p25"] == 1 - assert tf.transformation_fn.keywords["p50"] == 2 - assert tf.transformation_fn.keywords["p75"] == 3 - - def test_populate_builtin_fn_arguments_label_encoder(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.encoder_stats", - return_value="test", - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def label_encoder(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=label_encoder, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["value_to_index"] == "test" - - def test_populate_builtin_attached_fns(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin", - return_value=False, - ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_fn_arguments" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=testFunction - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf_attached - transformation_fn_dict["tf1_name"] = tf1_attached - - # Act - tf_engine.populate_builtin_attached_fns( - attached_transformation_fns=transformation_fn_dict, - feature_descriptive_stats=None, - ) - - # Assert - assert transformation_fn_dict["tf_name"] == tf_attached - assert transformation_fn_dict["tf1_name"] == tf1_attached - - def test_populate_builtin_attached_fns_is_builtin(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_fn_arguments" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - 
name="tf_name", transformation_function=testFunction - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=testFunction - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf_attached - transformation_fn_dict["tf1_name"] = tf1_attached - - # Act - tf_engine.populate_builtin_attached_fns( - attached_transformation_fns=transformation_fn_dict, - feature_descriptive_stats=None, - ) - - # Assert - assert transformation_fn_dict["tf_name"] != tf_attached - assert transformation_fn_dict["tf1_name"] != tf1_attached - - def test_infer_spark_type_string_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(str) - - # Assert - assert result == "STRING" - - def test_infer_spark_type_string_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("str") - - # Assert - assert result == "STRING" - - def test_infer_spark_type_string_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("string") - - # Assert - assert result == "STRING" - - def test_infer_spark_type_byte_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(bytes) - result1 = tf_engine.infer_spark_type("BinaryType()") - - # Assert - assert result == "BINARY" - assert result1 == "BINARY" - - def test_infer_spark_type_int8_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int8) - - # Assert - assert result == "BYTE" - - def test_infer_spark_type_int8_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int8") - - # Assert - assert result == "BYTE" - - def test_infer_spark_type_int8_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("byte") - result1 = tf_engine.infer_spark_type("ByteType()") - - # Assert - assert result == "BYTE" - assert result1 == "BYTE" - - def test_infer_spark_type_int16_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int16) - - # Assert - assert result == "SHORT" - - def test_infer_spark_type_int16_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int16") - - # Assert - assert result == "SHORT" - - def test_infer_spark_type_int16_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = 
tf_engine.infer_spark_type("short") - result1 = tf_engine.infer_spark_type("ShortType()") - - # Assert - assert result == "SHORT" - assert result1 == "SHORT" - - def test_infer_spark_type_int_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(int) - - # Assert - assert result == "INT" - - def test_infer_spark_type_int_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int") - - # Assert - assert result == "INT" - - def test_infer_spark_type_int_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int32) - result1 = tf_engine.infer_spark_type("IntegerType()") - - # Assert - assert result == "INT" - assert result1 == "INT" - - def test_infer_spark_type_int64_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int64) - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int64") - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("long") - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_4(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("bigint") - result1 = tf_engine.infer_spark_type("LongType()") - - # Assert - assert result == "LONG" - assert result1 == "LONG" - - def test_infer_spark_type_float_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(float) - - # Assert - assert result == "FLOAT" - - def test_infer_spark_type_float_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("float") - result1 = tf_engine.infer_spark_type("FloatType()") - - # Assert - assert result == "FLOAT" - assert result1 == "FLOAT" - - def test_infer_spark_type_double_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.float64) - - # Assert - assert result == "DOUBLE" - - def test_infer_spark_type_double_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("float64") - - # Assert - assert result == "DOUBLE" - - def 
test_infer_spark_type_double_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("double") - result1 = tf_engine.infer_spark_type("DoubleType()") - - # Assert - assert result == "DOUBLE" - assert result1 == "DOUBLE" - - def test_infer_spark_type_timestamp_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(datetime.datetime) - - # Assert - assert result == "TIMESTAMP" - - def test_infer_spark_type_timestamp_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.datetime64) - result1 = tf_engine.infer_spark_type("TimestampType()") - - # Assert - assert result == "TIMESTAMP" - assert result1 == "TIMESTAMP" - - def test_infer_spark_type_date_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(datetime.date) - result1 = tf_engine.infer_spark_type("DateType()") - - # Assert - assert result == "DATE" - assert result1 == "DATE" - - def test_infer_spark_type_bool_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(bool) - - # Assert - assert result == "BOOLEAN" - - def test_infer_spark_type_bool_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("boolean") - - # Assert - assert result == "BOOLEAN" - - def test_infer_spark_type_bool_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("bool") - result1 = tf_engine.infer_spark_type("BooleanType()") - - # Assert - assert result == "BOOLEAN" - assert result1 == "BOOLEAN" - - def test_infer_spark_type_wrong_type(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - with pytest.raises(TypeError) as e_info: - tf_engine.infer_spark_type("wrong") - - # Assert - assert str(e_info.value) == "Not supported type wrong." 
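
The infer_spark_type / output_type tests removed above appear to be superseded by output types declared directly on the UDF decorator, as the updated tests elsewhere in this diff show. The following is a minimal illustrative sketch based only on the usage visible in those tests; the function name example_scaler, its centring logic, and the feature store id 99 are placeholders, not part of this change.

import pandas as pd

from hsfs import transformation_function
from hsfs.hopsworks_udf import UDFType, udf
from hsfs.transformation_statistics import TransformationStatistics

# Training statistics for "col1" are requested by declaring a default
# `statistics` argument on the UDF; the decorator argument (`float`) declares
# the output type that the removed infer_spark_type machinery used to infer.
stats = TransformationStatistics("col1")


@udf(float, drop=["col1"])
def example_scaler(col1: pd.Series, statistics=stats) -> pd.Series:
    # Hypothetical model-dependent transformation: centre the feature on the
    # mean computed from the training data.
    return col1 - statistics.col1.mean


# Attach the UDF to a transformation function object, as the updated tests do.
tf = transformation_function.TransformationFunction(
    99,  # feature store id (placeholder)
    hopsworks_udf=example_scaler,
    transformation_type=UDFType.MODEL_DEPENDENT,
)
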
- - def test_compute_transformation_fn_statistics(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=99, - splits={}, - id=10, - ) - - # Act - tf_engine.compute_transformation_fn_statistics( - training_dataset_obj=td, - builtin_tffn_features=None, - label_encoder_features=None, - feature_dataframe=None, - feature_view_obj=None, - ) - - # Assert - assert ( - mock_s_engine.return_value.compute_transformation_fn_statistics.call_count - == 1 - ) - - def test_populate_builtin_transformation_functions(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_engine_compute_transformation_fn_statistics = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.compute_transformation_fn_statistics" - ) - mock_tf_engine_populate_builtin_attached_fns = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_attached_fns" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - def label_encoder(): - print("Test") - - tf_label_encoder = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=label_encoder, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["label_encoder"] = tf_label_encoder - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, - ) - - dataset = mocker.Mock() - - # Act - tf_engine.populate_builtin_transformation_functions( - training_dataset=td, feature_view_obj=None, dataset=dataset - ) - - # Assert - assert mock_tf_engine_compute_transformation_fn_statistics.call_count == 1 - assert mock_tf_engine_populate_builtin_attached_fns.call_count == 1 - assert dataset.get.call_count == 0 - - def test_populate_builtin_transformation_functions_splits(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_engine_compute_transformation_fn_statistics = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.compute_transformation_fn_statistics" - ) - mock_tf_engine_populate_builtin_attached_fns = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_attached_fns" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - def 
label_encoder(): - print("Test") - - tf_label_encoder = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=label_encoder, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["label_encoder"] = tf_label_encoder - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={"key": "value"}, - id=10, - transformation_functions=transformation_fn_dict, - ) - - dataset = mocker.Mock() - - # Act - tf_engine.populate_builtin_transformation_functions( - training_dataset=td, feature_view_obj=None, dataset=dataset - ) - - # Assert - assert mock_tf_engine_compute_transformation_fn_statistics.call_count == 1 - assert mock_tf_engine_populate_builtin_attached_fns.call_count == 1 - assert dataset.get.call_count == 1 - - # Previously in test_feature_view_engine - def test_get_fv_attached_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - td_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id=feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = tf - - # Act - result = td_engine.get_fv_attached_transformation_fn( - fv_name="fv_name", fv_version=1 - ) - - # Assert - assert "tf_name" in result - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 - - def test_get_fv_attached_transformation_fn_multiple(self, mocker): - # Arrange - feature_store_id = 99 - - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - - td_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id=feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - tf1 = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=testFunction - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = [tf, tf1] - - # Act - result = td_engine.get_fv_attached_transformation_fn( - fv_name="fv_name", fv_version=1 + tf_engine.get_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, training_dataset_version=1 ) # Assert - assert "tf_name" in result - assert "tf1_name" in result - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 + assert mock_s_engine.return_value.get.call_count == 1 diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 08bc8d52a7..cbbe190c4d 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -23,12 +23,12 @@ import pytest from confluent_kafka.admin import PartitionMetadata, TopicMetadata from hsfs import ( + engine, feature, feature_group, feature_view, storage_connector, training_dataset, - transformation_function, util, ) from hsfs.client import exceptions @@ -36,10 +36,15 @@ from hsfs.constructor.hudi_feature_group_alias import HudiFeatureGroupAlias from hsfs.core import inode, job from hsfs.engine import python +from hsfs.hopsworks_udf import UDFType, udf from 
hsfs.training_dataset_feature import TrainingDatasetFeature +from hsfs.transformation_function import TransformationFunction from polars.testing import assert_frame_equal as polars_assert_frame_equal +engine._engine_type = "python" + + class TestPython: def test_sql(self, mocker): # Arrange @@ -1456,7 +1461,6 @@ def test_parse_schema_feature_group_polars(self, mocker): result = python_engine.parse_schema_feature_group( dataframe=df, time_travel_format=None ) - print(result) # Assert assert len(result) == 3 @@ -1464,6 +1468,71 @@ def test_parse_schema_feature_group_polars(self, mocker): assert result[1].name == "col2" assert result[2].name == "date" + def test_parse_schema_feature_group_transformation_functions(self, mocker): + # Arrange + mocker.patch("hsfs.engine.python.Engine._convert_pandas_dtype_to_offline_type") + + python_engine = python.Engine() + + d = {"Col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=d) + + @udf(int) + def test(feature): + return feature + 1 + + transformation_function = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + # Act + result = python_engine.parse_schema_feature_group( + dataframe=df, + time_travel_format=None, + transformation_functions=[transformation_function], + ) + + # Assert + assert len(result) == 3 + assert result[0].name == "col1" + assert result[1].name == "col2" + assert result[2].name == "test" + + def test_parse_schema_feature_group_transformation_functions_drop(self, mocker): + # Arrange + mocker.patch("hsfs.engine.python.Engine._convert_pandas_dtype_to_offline_type") + + python_engine = python.Engine() + + d = {"Col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=d) + + @udf(int, drop="feature") + def test(feature): + return feature + 1 + + transformation_function = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test("col2"), + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + # Act + result = python_engine.parse_schema_feature_group( + dataframe=df, + time_travel_format=None, + transformation_functions=[transformation_function], + ) + + # Assert + assert len(result) == 2 + assert result[0].name == "col1" + assert result[1].name == "test" + def test_parse_schema_training_dataset(self): # Arrange python_engine = python.Engine() @@ -2132,6 +2201,52 @@ def test_save_dataframe(self, mocker): assert mock_python_engine_write_dataframe_kafka.call_count == 0 assert mock_python_engine_legacy_save_dataframe.call_count == 1 + def test_save_dataframe_transformation_functions(self, mocker): + # Arrange + mock_python_engine_write_dataframe_kafka = mocker.patch( + "hsfs.engine.python.Engine._write_dataframe_kafka" + ) + mock_python_engine_legacy_save_dataframe = mocker.patch( + "hsfs.engine.python.Engine.legacy_save_dataframe" + ) + mock_python_engine_apply_transformations = mocker.patch( + "hsfs.engine.python.Engine._apply_transformation_function" + ) + + python_engine = python.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + stream=False, + transformation_functions=[test], + ) + + # Act + python_engine.save_dataframe( + feature_group=fg, + dataframe=None, + operation=None, + online_enabled=None, + storage=None, + offline_write_options=None, + online_write_options=None, + validation_id=None, + ) + + # Assert + assert mock_python_engine_write_dataframe_kafka.call_count == 0 + assert 
mock_python_engine_legacy_save_dataframe.call_count == 1 + assert mock_python_engine_apply_transformations.call_count == 1 + def test_save_dataframe_stream(self, mocker): # Arrange mock_python_engine_write_dataframe_kafka = mocker.patch( @@ -2210,6 +2325,7 @@ def test_get_training_data(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2226,7 +2342,7 @@ def test_get_training_data(self, mocker): # Act python_engine.get_training_data( training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, query_obj=mocker.Mock(), read_options=None, dataframe_type="default", @@ -2423,7 +2539,7 @@ def test_split_labels_labels_dataframe_type_polars(self): result_df, result_df_split = python_engine.split_labels( df=df, dataframe_type="polars", labels="col1" ) - print(type(result_df_split)) + # Assert assert isinstance(result_df, pl.DataFrame) or isinstance( result_df, pl.dataframe.frame.DataFrame @@ -2473,6 +2589,7 @@ def test_prepare_transform_split_df_random_split(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2500,7 +2617,7 @@ def test_prepare_transform_split_df_random_split(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) @@ -2521,6 +2638,7 @@ def test_prepare_transform_split_df_time_split_td_features(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2566,7 +2684,7 @@ def test_prepare_transform_split_df_time_split_td_features(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) @@ -2587,6 +2705,7 @@ def test_prepare_transform_split_df_time_split_query_features(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2631,7 +2750,7 @@ def test_prepare_transform_split_df_time_split_query_features(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) @@ -2957,6 +3076,7 @@ def test_write_training_dataset(self, mocker): def test_write_training_dataset_query_td(self, mocker, backend_fixtures): # Arrange + mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.training_dataset_job_conf.TrainingDatasetJobConf") mock_job = mocker.patch("hsfs.core.job.Job") @@ -3001,6 +3121,7 @@ def test_write_training_dataset_query_td(self, mocker, backend_fixtures): def test_write_training_dataset_query_fv(self, mocker, backend_fixtures): # Arrange + mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.training_dataset_job_conf.TrainingDatasetJobConf") mock_job = mocker.patch("hsfs.core.job.Job") @@ -3230,86 +3351,271 
@@ def test_add_file(self): def test_apply_transformation_function_pandas(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + engine._engine_type = "python" + python_engine = python.Engine() + + @udf(int) + def plus_one(col1): + return col1 + 1 + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_one("tf_name")], + ) + + df = pd.DataFrame(data={"tf_name": [1, 2]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert len(result["plus_one_tf_name_"]) == 2 + assert result["plus_one_tf_name_"][0] == 2 + assert result["plus_one_tf_name_"][1] == 3 + def test_apply_transformation_function_multiple_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "python" python_engine = python.Engine() - def plus_one(a): - return a + 1 + @udf([int, int], drop=["col1"]) + def plus_two(col1): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) - tf = transformation_function.TransformationFunction( - 99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, ) - transformation_fn_dict = dict() + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) - transformation_fn_dict["tf_name"] = tf + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) - td = training_dataset.TrainingDataset( - name="test", + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all(result.columns == ["col2", "plus_two_col1_0", "plus_two_col1_1"]) + assert len(result) == 2 + assert result["plus_two_col1_0"][0] == 2 + assert result["plus_two_col1_0"][1] == 3 + assert result["plus_two_col1_1"][0] == 3 + assert result["plus_two_col1_1"][1] == 4 + + def test_apply_transformation_function_multiple_input_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + + @udf([int, int]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", version=1, - data_format="CSV", featurestore_id=99, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, ) - df = pd.DataFrame(data={"tf_name": [1, 2]}) + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, dataset=df + transformation_functions=fv.transformation_functions, dataset=df ) # Assert - assert len(result["tf_name"]) == 2 - 
assert result["tf_name"][0] == 2 - assert result["tf_name"][1] == 3 + assert all( + result.columns + == ["col1", "col2", "plus_two_col1_col2_0", "plus_two_col1_col2_1"] + ) + assert len(result) == 2 + assert result["col1"][0] == 1 + assert result["col1"][1] == 2 + assert result["col2"][0] == 10 + assert result["col2"][1] == 11 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 - def test_apply_transformation_function_polars(self, mocker): + def test_apply_transformation_function_multiple_input_output_drop_all(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + engine._engine_type = "python" python_engine = python.Engine() - def plus_one(a): - return a + 1 + @udf([int, int], drop=["col1", "col2"]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) - tf = transformation_function.TransformationFunction( - 99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, ) - transformation_fn_dict = dict() + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) - transformation_fn_dict["tf_name"] = tf + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) - td = training_dataset.TrainingDataset( - name="test", + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]) + assert len(result) == 2 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 + + def test_apply_transformation_function_multiple_input_output_drop_some( + self, mocker + ): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + + @udf([int, int], drop=["col1"]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", version=1, - data_format="CSV", featurestore_id=99, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all( + result.columns == ["col2", "plus_two_col1_col2_0", "plus_two_col1_col2_1"] + ) + assert len(result) == 2 + assert result["col2"][0] == 10 + assert result["col2"][1] == 11 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 + + def test_apply_transformation_function_polars(self, 
mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + + @udf(int) + def plus_one(col1): + return col1 + 1 + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_one("tf_name")], ) df = pl.DataFrame(data={"tf_name": [1, 2]}) # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, dataset=df + transformation_functions=fv.transformation_functions, dataset=df ) # Assert - assert len(result["tf_name"]) == 2 - assert result["tf_name"][0] == 2 - assert result["tf_name"][1] == 3 + assert len(result["plus_one_tf_name_"]) == 2 + assert result["plus_one_tf_name_"][0] == 2 + assert result["plus_one_tf_name_"][1] == 3 def test_get_unique_values(self): # Arrange @@ -3763,7 +4069,7 @@ def test_materialization_kafka_first_job_execution(self, mocker): args="defaults tests_offsets", await_termination=False, ) - + def test_materialization_kafka_skip_offsets(self, mocker): # Arrange mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) @@ -3805,7 +4111,10 @@ def test_materialization_kafka_skip_offsets(self, mocker): python_engine._write_dataframe_kafka( feature_group=fg, dataframe=df, - offline_write_options={"start_offline_materialization": True, "skip_offsets": True}, + offline_write_options={ + "start_offline_materialization": True, + "skip_offsets": True, + }, ) # Assert diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index 0e25037751..8c29128641 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -16,26 +16,28 @@ from __future__ import annotations import datetime +import os import statistics -import numpy as np import pandas as pd import pytest -import pytz import tzlocal from hsfs import ( + engine, training_dataset, training_dataset_feature, transformation_function, ) +from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.core.transformation_function_engine import TransformationFunctionEngine from hsfs.engine import python, spark +from hsfs.hopsworks_udf import HopsworksUdf, UDFType, udf from pyspark.sql.types import ( BooleanType, DateType, DoubleType, IntegerType, + LongType, StringType, StructField, StructType, @@ -43,28 +45,13 @@ ) +# TODO : Remove skipping UT in windows after Greater expectations has been upgraded to 1.0 or after it has been made optional +@pytest.mark.skipif( + os.name == "nt", + reason="Skip tests in windows since it fails due to dependency problem with greater expectations 0.18.2, Fixed on upgrading to 1.0", +) class TestPythonSparkTransformationFunctions: - def _create_training_dataset( - self, tf_fun, output_type=None, name=None, col="col_0" - ): - if isinstance(tf_fun, str): - tf = transformation_function.TransformationFunction( - name=name, - featurestore_id=99, - transformation_fn=None, - source_code_content=tf_fun, - output_type=output_type, - ) - else: - tf = transformation_function.TransformationFunction( - 
featurestore_id=99, - transformation_fn=tf_fun, - builtin_source_code=None, - output_type=output_type, - ) - transformation_fn_dict = dict() - transformation_fn_dict[col] = tf - + def _create_training_dataset(self): f = training_dataset_feature.TrainingDatasetFeature( name="col_0", type=IntegerType(), index=0 ) @@ -83,18 +70,18 @@ def _create_training_dataset( featurestore_id=99, splits={}, features=features, - transformation_functions=transformation_fn_dict, ) return td - def _validate_on_python_engine(self, td, df, expected_df): + def _validate_on_python_engine(self, td, df, expected_df, transformation_functions): # Arrange + engine._engine_type = "python" python_engine = python.Engine() # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=transformation_functions, dataset=df, ) @@ -102,13 +89,16 @@ def _validate_on_python_engine(self, td, df, expected_df): assert list(result.dtypes) == list(expected_df.dtypes) assert result.equals(expected_df) - def _validate_on_spark_engine(self, td, spark_df, expected_spark_df): + def _validate_on_spark_engine( + self, td, spark_df, expected_spark_df, transformation_functions + ): # Arrange + engine._engine_type = "spark" spark_engine = spark.Engine() # Act result = spark_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=transformation_functions, dataset=spark_df, ) @@ -116,9 +106,10 @@ def _validate_on_spark_engine(self, td, spark_df, expected_spark_df): assert result.schema == expected_spark_df.schema assert result.collect() == expected_spark_df.collect() - def test_apply_builtin_minmax(self, mocker): + def test_apply_builtin_minmax_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -139,16 +130,16 @@ def test_apply_builtin_minmax(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("min_max_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [0.5, 1.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "min_max_scaler_col_0_": [0.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -156,34 +147,49 @@ def test_apply_builtin_minmax(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": ' - '"def min_max_scaler(value, min_value,max_value):\\n if value is None:\\n ' - "return None\\n else:\\n try:\\n return (value - min_value) / (max_value - min_value)\\n" - ' except ZeroDivisionError:\\n return 0\\n"}' - ) - - td = self._create_training_dataset(tf_fun, "DOUBLE", "min_max_scaler") - - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", feature_type="Integral", min=0, max=2 - ) - ], + tf_fun_source = ( + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def min_max_scaler(feature: pd.Series, statistics = feature_statistics) -> 
pd.Series:\n" + " return (feature - statistics.feature.min) / (statistics.feature.max - statistics.feature.min)" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], + "name": "min_max_scaler", + "droppedArgumentNames": ["feature"], + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) - ) + ] + + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_labelencoder(self, mocker): + def test_apply_builtin_minmax(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -204,53 +210,49 @@ def test_apply_builtin_labelencoder(self, mocker): expected_schema = StructType( [ - StructField("col_0", IntegerType(), True), - StructField("col_1", IntegerType(), True), + StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("min_max_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [1, 2], - "col_1": [0, 1], + "col_1": ["test_1", "test_2"], "col_2": [True, False], + "min_max_scaler_col_0_": [0.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - expected_df["col_1"] = expected_df["col_1"].astype(pd.Int32Dtype()) # Arrange - tf_fun = ( - '{"module_imports": "", "transformer_code": "# label encoder\\n' - "def label_encoder(value, value_to_index):\\n" - " # define a mapping of values to integers\\n" - ' return value_to_index[value]"}' - ) + from hsfs.builtin_transformations import min_max_scaler - td = self._create_training_dataset(tf_fun, "INT", "label_encoder", "col_1") + td = self._create_training_dataset() - td.transformation_functions["col_1"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_1", - td.transformation_functions["col_1"], - [ - FeatureDescriptiveStatistics( - feature_name="col_1", - extended_statistics={"unique_values": ["test_1", "test_2"]}, - ) - ], + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=min_max_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) - ) + ] + + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_standard_scaler(self, mocker): + def test_apply_builtin_standard_scaler_from_backend(self, mocker): # Arrange 
mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -271,16 +273,16 @@ def test_apply_builtin_standard_scaler(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("standard_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [-1.0, 1.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "standard_scaler_col_0_": [-1.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -288,39 +290,50 @@ def test_apply_builtin_standard_scaler(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": "' - "def standard_scaler(value, mean, std_dev):\\n if value is None:\\n return None\\n " - "else:\\n try:\\n return (value - mean) / std_dev\\n except " - 'ZeroDivisionError:\\n return 0\\n"}' - ) - - td = self._create_training_dataset(tf_fun, "DOUBLE", "standard_scaler") - + tf_fun_source = ( + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def standard_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.mean) / statistics.feature.stddev" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], + "name": "standard_scaler", + "droppedArgumentNames": ["feature"], + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] mean = statistics.mean([1, 2]) stddev = statistics.pstdev([1, 2]) - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", - feature_type="Integral", - mean=mean, - stddev=stddev, - ) - ], - ) - ) + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", mean=mean, stddev=stddev) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_robustscaler(self, mocker): + def test_apply_builtin_standard_scaler(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -341,16 +354,16 @@ def test_apply_builtin_robustscaler(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("standard_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - 
"col_0": [-1.0, 0.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "standard_scaler_col_0_": [-1.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -358,40 +371,34 @@ def test_apply_builtin_robustscaler(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": "' - "def robust_scaler(value, p25, p50, p75):\\n if value is None:\\n " - "return None\\n else:\\n try:\\n return (value - p50) / (p75 - p25)\\n " - 'except ZeroDivisionError:\\n return 0\\n"}\n' - ) + from hsfs.builtin_transformations import standard_scaler - td = self._create_training_dataset(tf_fun, "DOUBLE", "robust_scaler") + td = self._create_training_dataset() - percentiles = [1] * 100 - percentiles[24] = 1 - percentiles[49] = 2 - percentiles[74] = 2 - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", - feature_type="Integral", - percentiles=percentiles, - ) - ], + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=standard_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) - ) + ] + + mean = statistics.mean([1, 2]) + stddev = statistics.pstdev([1, 2]) + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", mean=mean, stddev=stddev) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_int(self, mocker): + def test_apply_builtin_robust_scaler_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -412,36 +419,70 @@ def test_apply_plus_one_int(self, mocker): expected_schema = StructType( [ - StructField("col_0", IntegerType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("robust_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [2, 3], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "robust_scaler_col_0_": [-1.0, 0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - expected_df["col_0"] = expected_df["col_0"].astype(pd.Int32Dtype()) # Arrange - def tf_fun(a) -> int: - return a + 1 - - td = self._create_training_dataset(tf_fun, "int") + tf_fun_source = ( + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def robust_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.percentiles[49]) / (statistics.feature.percentiles[74] - " + "statistics.feature.percentiles[24])" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], + "name": "robust_scaler", + "droppedArgumentNames": 
["feature"], + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] + percentiles = [1] * 100 + percentiles[24] = 1 + percentiles[49] = 2 + percentiles[74] = 2 + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_str(self, mocker): + def test_apply_builtin_robust_scaler(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -462,16 +503,16 @@ def test_apply_plus_one_str(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("robust_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "robust_scaler_col_0_": [-1.0, 0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -479,16 +520,33 @@ def test_apply_plus_one_str(self, mocker): ) # Arrange - def tf_fun(a) -> int: - return a + 1 + from hsfs.builtin_transformations import robust_scaler - td = self._create_training_dataset(tf_fun, "string") + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=robust_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] + + percentiles = [1] * 100 + percentiles[24] = 1 + percentiles[49] = 2 + percentiles[74] = 2 + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_double(self, mocker): + def test_apply_plus_one_int(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() @@ -507,96 +565,107 @@ def test_apply_plus_one_double(self, mocker): "col_2": [True, False], } ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", LongType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [2.0, 3.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [2, 3], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - spark_df = spark_engine._spark_session.createDataFrame(df, 
schema=schema) # Arrange - def tf_fun(a) -> np.float64: - return a + 1.0 + @udf(int, drop=["col_0"]) + def tf_fun(col_0): + return col_0 + 1 + + td = self._create_training_dataset() - td = self._create_training_dataset(tf_fun, "double") + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_no_tz(self, mocker): + def test_apply_plus_one_str(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": ["1", "2"], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", StringType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1640995201), - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": ["11", "21"], } ) - # convert timestamps to current timezone - local_tz = tzlocal.get_localzone() - expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df_localized, schema=expected_schema + expected_df, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1) - - td = self._create_training_dataset(tf_fun, "datetime") + @udf(str, drop="col_0") + def tf_fun(col_0): + return col_0 + "1" + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_tz_utc(self, mocker): + def test_apply_plus_one_double(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() @@ -610,127 +679,147 @@ def test_apply_plus_one_datetime_tz_utc(self, mocker): ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), 
StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1640995201), - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [2.0, 3.0], } ) - # convert timestamps to current timezone - local_tz = tzlocal.get_localzone() - expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df_localized, schema=expected_schema + expected_df, schema=expected_schema ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1).replace( - tzinfo=datetime.timezone.utc + @udf(float, drop="col_0") + def tf_fun(col_0): + return col_0 + 1.0 + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) - - td = self._create_training_dataset(tf_fun, "datetime") + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_tz_pst(self, mocker): + def test_apply_plus_one_datetime_no_tz(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) - expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1641024001), - datetime.datetime.utcfromtimestamp(1641024002), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) # convert timestamps to current timezone local_tz = tzlocal.get_localzone() expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - pdt = pytz.timezone("US/Pacific") - return pdt.localize(datetime.datetime.utcfromtimestamp(a 
+ 1)) + @udf(datetime.datetime, drop="col_0") + def tf_fun(col_0): + import datetime - td = self._create_training_dataset(tf_fun, "datetime") + return col_0 + datetime.timedelta(milliseconds=1) + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_ts_none(self, mocker): + def test_apply_plus_one_datetime_tz_utc(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -739,59 +828,77 @@ def test_apply_plus_one_datetime_ts_none(self, mocker): expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) - expected_df = pd.DataFrame( data={ - "col_0": [ - None, - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) # convert timestamps to current timezone local_tz = tzlocal.get_localzone() expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return ( - None if a == 1640995200 else datetime.datetime.utcfromtimestamp(a + 1) + @udf(datetime.datetime, drop="col_0") + def tf_fun(col_0) -> datetime.datetime: + import datetime + + return (col_0 + datetime.timedelta(milliseconds=1)).dt.tz_localize( + datetime.timezone.utc ) - td = self._create_training_dataset(tf_fun, "datetime") + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_date(self, mocker): + def test_apply_plus_one_datetime_tz_pst(self, mocker): # 
Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1641045600, 1641132000], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -800,50 +907,79 @@ def test_apply_plus_one_date(self, mocker): expected_schema = StructType( [ - StructField("col_0", DateType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) + expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1641045601).date(), - datetime.datetime.utcfromtimestamp(1641132001).date(), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) + # convert timestamps to current timezone + local_tz = tzlocal.get_localzone() + expected_df_localized = expected_df.copy(True) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df, schema=expected_schema + expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1) + @udf(datetime.datetime, drop="col_0") + def tf_fun(col_0) -> datetime.datetime: + import datetime - td = self._create_training_dataset(tf_fun, "date") + import pytz + + pdt = pytz.timezone("US/Pacific") + return (col_0 + datetime.timedelta(milliseconds=1)).dt.tz_localize(pdt) + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_no_type(self, mocker): + def test_apply_plus_one_datetime_ts_none(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1, 2], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -852,47 +988,80 @@ def test_apply_plus_one_no_type(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) + expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], 
"col_2": [True, False], + "tf_fun_col_0_": [ + None, + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) + # convert timestamps to current timezone + local_tz = tzlocal.get_localzone() + expected_df_localized = expected_df.copy(True) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df, schema=expected_schema + expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> int: - return a + 1 + @udf(datetime.datetime, drop=["col_0"]) + def tf_fun(col_0) -> datetime.datetime: + import datetime + + return pd.Series( + None + if data == datetime.datetime.utcfromtimestamp(1640995200) + else data + datetime.timedelta(milliseconds=1) + for data in col_0 + ) - td = self._create_training_dataset(tf_fun) + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_empty_type(self, mocker): + def test_apply_plus_one_date(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", DateType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1, 2], + "col_0": [ + datetime.datetime.utcfromtimestamp(1641045600).date(), + datetime.datetime.utcfromtimestamp(1641132000).date(), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -901,16 +1070,21 @@ def test_apply_plus_one_empty_type(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", DateType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1641045600).date() + + datetime.timedelta(days=1), + datetime.datetime.utcfromtimestamp(1641132000).date() + + datetime.timedelta(days=1), + ], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -918,26 +1092,39 @@ def test_apply_plus_one_empty_type(self, mocker): ) # Arrange - def tf_fun(a) -> int: - return a + 1 + @udf(datetime.date, drop=["col_0"]) + def tf_fun(col_0): + import datetime - td = self._create_training_dataset(tf_fun, "") + return col_0 + datetime.timedelta(days=1) + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, 
transformation_functions + ) - def test_apply_plus_one_date_not_supported_type(self, mocker): + def test_apply_plus_one_invalid_type(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") # Arrange - def tf_fun(a) -> int: - return a + 1 + with pytest.raises(FeatureStoreException) as e_info: - # Act - with pytest.raises(TypeError) as e_info: - self._create_training_dataset(tf_fun, list) + @udf(list, drop="a") + def tf_fun(a): + return a + 1 - # Assert - assert str(e_info.value) == "Not supported type ." + assert ( + str(e_info.value) + == f"Output type {list} is not supported. Please refer to the documentation to get more information on the supported types." + ) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 5c7d76add0..5e31959ef4 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -23,6 +23,7 @@ expectation_suite, feature, feature_group, + feature_view, storage_connector, training_dataset, training_dataset_feature, @@ -33,6 +34,7 @@ from hsfs.constructor import hudi_feature_group_alias, query from hsfs.core import training_dataset_engine from hsfs.engine import spark +from hsfs.hopsworks_udf import UDFType, udf from hsfs.training_dataset_feature import TrainingDatasetFeature from pyspark.sql import DataFrame from pyspark.sql.types import ( @@ -467,7 +469,6 @@ def test_convert_to_default_dataframe_pyspark_rdd(self): # Assert result_df = result.toPandas() - print(result_df) assert list(result_df) == list(expected) for column in list(result_df): assert result_df[column].equals(result_df[column]) @@ -642,6 +643,51 @@ def test_save_dataframe(self, mocker): assert mock_spark_engine_save_online_dataframe.call_count == 0 assert mock_spark_engine_save_offline_dataframe.call_count == 1 + def test_save_dataframe_transformations(self, mocker): + # Arrange + mock_spark_engine_save_online_dataframe = mocker.patch( + "hsfs.engine.spark.Engine._save_online_dataframe" + ) + mock_spark_engine_save_offline_dataframe = mocker.patch( + "hsfs.engine.spark.Engine._save_offline_dataframe" + ) + mock_spark_engine_apply_transformations = mocker.patch( + "hsfs.engine.spark.Engine._apply_transformation_function" + ) + + spark_engine = spark.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + transformation_functions=[test], + ) + + # Act + spark_engine.save_dataframe( + feature_group=fg, + dataframe=None, + operation=None, + online_enabled=None, + storage=None, + offline_write_options=None, + online_write_options=None, + validation_id=None, + ) + + # Assert + assert mock_spark_engine_save_online_dataframe.call_count == 0 + assert mock_spark_engine_save_offline_dataframe.call_count == 1 + assert mock_spark_engine_apply_transformations.call_count == 1 + def test_save_dataframe_storage_offline(self, mocker): # Arrange mock_spark_engine_save_online_dataframe = mocker.patch( @@ -977,6 +1023,135 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): == 0 ) + def test_save_stream_dataframe_transformations(self, mocker, backend_fixtures): + # Arrange + mock_client_get_instance = mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") + mock_spark_engine_online_fg_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._online_fg_to_avro" + ) + + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + 
mock_engine_get_instance.return_value.add_file.return_value = ( + "result_from_add_file" + ) + + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + + mock_spark_engine_apply_transformations = mocker.patch( + "hsfs.engine.spark.Engine._apply_transformation_function" + ) + + json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + spark_engine = spark.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + online_topic_name="test_online_topic_name", + transformation_functions=[test], + ) + fg.feature_store = mocker.Mock() + project_id = 1 + fg.feature_store.project_id = project_id + + mock_client_get_instance.return_value._project_name = "test_project_name" + + # Act + spark_engine.save_stream_dataframe( + feature_group=fg, + dataframe=None, + query_name=None, + output_mode="test_mode", + await_termination=None, + timeout=None, + checkpoint_dir=None, + write_options={"test_name": "test_value"}, + ) + + # Assert + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + == "headers" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + 0 + ][0] + == "test_mode" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + 0 + ][0] + == "kafka" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + 0 + ][0] + == "checkpointLocation" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + 0 + ][1] + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + 1 + ] + == { + "kafka.bootstrap.servers": "test_bootstrap_servers", + "kafka.security.protocol": "test_security_protocol", + "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "kafka.ssl.key.password": "test_ssl_key_password", + "kafka.ssl.keystore.location": "result_from_add_file", + "kafka.ssl.keystore.password": "test_ssl_keystore_password", + "kafka.ssl.truststore.location": "result_from_add_file", + "kafka.ssl.truststore.password": "test_ssl_truststore_password", + "kafka.test_option_name": "test_option_value", + "test_name": "test_value", + } + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + 0 + ][0] + == "topic" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + 0 + ][1] + == "test_online_topic_name" + ) + assert ( + 
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + 0 + ][0] + == self._get_spark_query_name(project_id, fg) + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + == 0 + ) + assert mock_spark_engine_apply_transformations.call_count == 1 + def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): # Arrange mock_client_get_instance = mocker.patch("hsfs.client.get_instance") @@ -1729,9 +1904,6 @@ def test_write_training_dataset(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1806,7 +1978,24 @@ def test_write_training_dataset_to_df(self, mocker, backend_fixtures): statistics_config=None, training_dataset_type=training_dataset.TrainingDataset.IN_MEMORY, extra_filter=None, - transformation_functions={}, + ) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], ) # Act @@ -1816,7 +2005,7 @@ def test_write_training_dataset_to_df(self, mocker, backend_fixtures): user_write_options={}, save_mode=training_dataset_engine.TrainingDatasetEngine.OVERWRITE, read_options={}, - feature_view_obj=None, + feature_view_obj=fv, to_df=True, ) @@ -1846,6 +2035,24 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): query_df = spark_engine._spark_session.createDataFrame(df) mock_query_read.side_effect = [query_df] + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=None, @@ -1865,7 +2072,6 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): training_dataset_type=training_dataset.TrainingDataset.IN_MEMORY, extra_filter=None, seed=1, - transformation_functions={}, ) # Act @@ -1875,7 +2081,7 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): user_write_options={}, save_mode=training_dataset_engine.TrainingDatasetEngine.OVERWRITE, read_options={}, - feature_view_obj=None, + feature_view_obj=fv, to_df=True, ) @@ -1897,9 +2103,6 @@ def test_write_training_dataset_query(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) 
mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1910,6 +2113,24 @@ def test_write_training_dataset_query(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -1927,7 +2148,7 @@ def test_write_training_dataset_query(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -1948,9 +2169,6 @@ def test_write_training_dataset_query_coalesce(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1961,6 +2179,24 @@ def test_write_training_dataset_query_coalesce(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -1979,7 +2215,7 @@ def test_write_training_dataset_query_coalesce(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -2000,9 +2236,6 @@ def test_write_training_dataset_td_splits(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -2013,6 +2246,24 @@ def test_write_training_dataset_td_splits(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -2034,7 +2285,7 @@ def test_write_training_dataset_td_splits(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -2056,9 +2307,6 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - 
"hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -2069,6 +2317,24 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -2091,7 +2357,7 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -2575,20 +2841,16 @@ def test_write_training_dataset_splits(self, mocker): spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + hopsworks_udf=plus_one, + transformation_type=UDFType.MODEL_DEPENDENT, ) - transformation_fn_dict = dict() - - transformation_fn_dict["col_0"] = tf - f = training_dataset_feature.TrainingDatasetFeature( name="col_0", type=IntegerType(), index=0 ) @@ -2603,7 +2865,6 @@ def plus_one(a) -> int: data_format="CSV", featurestore_id=99, splits={}, - transformation_functions=transformation_fn_dict, features=features, ) @@ -2614,6 +2875,7 @@ def plus_one(a) -> int: write_options=None, save_mode=None, to_df=False, + transformation_functions=[tf("col_0")], ) # Assert @@ -2629,14 +2891,14 @@ def test_write_training_dataset_splits_to_df(self, mocker): spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + hopsworks_udf=plus_one, + transformation_type=UDFType.MODEL_DEPENDENT, ) transformation_fn_dict = dict() @@ -2668,6 +2930,7 @@ def plus_one(a) -> int: write_options=None, save_mode=None, to_df=True, + transformation_functions=[tf("col_0")], ) # Assert @@ -3621,6 +3884,81 @@ def test_parse_schema_feature_group(self, mocker): assert mock_spark_engine_convert_spark_type.call_count == 2 assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + def test_parse_schema_feature_group_transformations(self, mocker): + # Arrange + mock_spark_engine_convert_spark_type = mocker.patch( + "hsfs.engine.spark.Engine.convert_spark_type_to_offline_type" + ) + + spark_engine = spark.Engine() + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"]} + df = pd.DataFrame(data=d) + + @udf(int) + def test(feature): + return feature + 1 + + tf_function = transformation_function.TransformationFunction( + featurestore_id=10, + hopsworks_udf=test, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + # Act + result = spark_engine.parse_schema_feature_group( + dataframe=spark_df, + time_travel_format=None, + transformation_functions=[tf_function], + ) + + # Assert + assert result[0].name == "col_0" + assert 
result[1].name == "col_1" + assert result[2].name == "test" + assert mock_spark_engine_convert_spark_type.call_count == 2 + assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + + def test_parse_schema_feature_group_transformations_dropped(self, mocker): + # Arrange + mock_spark_engine_convert_spark_type = mocker.patch( + "hsfs.engine.spark.Engine.convert_spark_type_to_offline_type" + ) + + spark_engine = spark.Engine() + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"]} + df = pd.DataFrame(data=d) + + @udf(int, drop="feature") + def test(feature): + return feature + 1 + + tf_function = transformation_function.TransformationFunction( + featurestore_id=10, + hopsworks_udf=test("col_0"), + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + # Act + result = spark_engine.parse_schema_feature_group( + dataframe=spark_df, + time_travel_format=None, + transformation_functions=[tf_function], + ) + + # Assert + assert result[0].name == "col_1" + assert result[1].name == "test" + assert mock_spark_engine_convert_spark_type.call_count == 2 + assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + def test_parse_schema_feature_group_hudi(self, mocker): # Arrange mock_spark_engine_convert_spark_type = mocker.patch( @@ -4234,42 +4572,98 @@ def test_save_empty_dataframe(self, mocker): assert mock_spark_engine_save_dataframe.call_count == 1 assert mock_spark_table.call_count == 1 - def test_apply_transformation_function(self, mocker): + def test_apply_transformation_function_single_output(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") - + engine._engine_type = "spark" spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @udf(int, drop=["col1"]) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=plus_one, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=BooleanType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="long", + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0")], ) - transformation_fn_dict = dict() + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False]} + df = pd.DataFrame(data=d) - transformation_fn_dict["col_0"] = tf + spark_df = spark_engine._spark_session.createDataFrame(df) - f = training_dataset_feature.TrainingDatasetFeature( - name="col_0", type=IntegerType(), index=0 + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "col_2": [True, False], + "plus_one_col_0_": [2, 3], + } + ) # todo why it doesnt return int? 
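For readability of the expected frame just above: a model-dependent transformation declared with drop=["col1"] removes the mapped input column and appends the UDF output under a generated column name, here plus_one_col_0_. The following is a minimal standalone sketch in plain pandas (no Spark or hsfs involved; the column name is copied from the assertion above, and the drop/assign steps are only an illustration of the expected result, not the hsfs implementation):

import pandas as pd

# Sketch only: mirrors the expected_df of the single-output test above.
# The transformed input column ("col_0") is dropped and the UDF output is
# appended under the generated "<udf name>_<feature>_" column name.
df = pd.DataFrame(
    {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False]}
)
out = df.drop(columns=["col_0"]).assign(plus_one_col_0_=df["col_0"] + 1)

assert list(out.columns) == ["col_1", "col_2", "plus_one_col_0_"]
assert out["plus_one_col_0_"].tolist() == [2, 3]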
+ + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, ) - f1 = training_dataset_feature.TrainingDatasetFeature( - name="col_1", type=StringType(), index=1 + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @udf([int, int], drop=["col1"]) + def plus_two(col1): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=plus_two, transformation_type=UDFType.MODEL_DEPENDENT ) - features = [f, f1] - td = training_dataset.TrainingDataset( - name="test", + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=BooleanType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", version=1, - data_format="CSV", featurestore_id=99, - splits={}, + primary_key=[], + partition_key=[], features=features, - transformation_functions=transformation_fn_dict, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0")], ) d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False]} @@ -4279,9 +4673,10 @@ def plus_one(a) -> int: expected_df = pd.DataFrame( data={ - "col_0": [2, 3], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "plus_two_col_0_0": [2, 3], + "plus_two_col_0_1": [3, 4], } ) # todo why it doesnt return int? 
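The multi-output expectations above (plus_two_col_0_0, plus_two_col_0_1) and the single-output names used elsewhere in this diff (min_max_scaler_col_0_, plus_one_col_0_, tf_fun_col_0_) all follow the same pattern: the UDF name, the input feature names, and, for multi-output UDFs, a trailing index. The helper below reproduces that pattern purely as a reading aid for the assertions; it is an illustrative sketch, not the naming code inside hsfs:

# Illustration only: reproduces the output-column names asserted in these tests.
def transformed_column_names(udf_name, input_features, n_outputs):
    base = f"{udf_name}_{'_'.join(input_features)}_"
    return [base] if n_outputs == 1 else [f"{base}{i}" for i in range(n_outputs)]

assert transformed_column_names("plus_one", ["col_0"], 1) == ["plus_one_col_0_"]
assert transformed_column_names("plus_two", ["col_0"], 2) == [
    "plus_two_col_0_0",
    "plus_two_col_0_1",
]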
@@ -4289,10 +4684,191 @@ def plus_one(a) -> int: # Act result = spark_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @udf([int, int]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_0": [1, 2], + "col_1": ["test_1", "test_2"], + "col_2": [10, 11], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], + } + ) + + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=spark_df, ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + def test_apply_transformation_function_multiple_input_output_drop_some( + self, mocker + ): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @udf([int, int], drop=["col1"]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "col_2": [10, 11], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], + } + ) + + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = 
spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output_drop_all(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @udf([int, int], drop=["col1", "col2"]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], + } + ) # TODO: why doesn't it return int? + + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) # Assert assert result.schema == expected_spark_df.schema assert result.collect() == expected_spark_df.collect() diff --git a/python/tests/fixtures/backend_fixtures.py b/python/tests/fixtures/backend_fixtures.py index 34a2c9e594..5a7029172f 100644 --- a/python/tests/fixtures/backend_fixtures.py +++ b/python/tests/fixtures/backend_fixtures.py @@ -56,7 +56,6 @@ "training_dataset_feature", "training_dataset", "training_dataset_split", - "transformation_function_attached", "transformation_function", "user", "validation_report", diff --git a/python/tests/fixtures/feature_fixtures.json b/python/tests/fixtures/feature_fixtures.json index 1d486c0cc4..c9b330768e 100644 --- a/python/tests/fixtures/feature_fixtures.json +++ b/python/tests/fixtures/feature_fixtures.json @@ -9,6 +9,22 @@ "partition": false, "primary": true, "type": "int", + "on_demand": false, + "description": "test_description", + "feature_group": null + } + }, + "get_on_demand": { + "response": { + "defaultValue": "1", + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int", + "on_demand": true, "description": "test_description", "feature_group": null } } diff --git a/python/tests/fixtures/feature_group_fixtures.json b/python/tests/fixtures/feature_group_fixtures.json index 484a9e288d..bc967508b0 100644 --- a/python/tests/fixtures/feature_group_fixtures.json +++ b/python/tests/fixtures/feature_group_fixtures.json @@ -630,5 +630,127 @@ "version": 1 }, "headers": null + }, + "get_transformations": { + "response": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", +
"creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "transformation_functions":[ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_two(data1 : pd.Series):\n return data1 + 2\n", + "name": "add_two", + "outputTypes":["double"], + "transformationFeatures":["data"], + "dropped_argument_names":["data1"] + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_argument_names":["data1"] + } + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI", + "expectationSuite": { + "expectation_suite_name": "test_expectation_suite_name", + "expectations": [ + { + "expectation_type": "1", + "kwargs": "{ \"kwargs_key\": \"kwargs_value\" }", + "meta": "{ \"meta_key\": \"meta_value\" }", + "id": 32 + } + ], + "meta": "{ \"great_expectations_version\": \"0.15.12\", \"key\": \"value\" }", + "id": 21, + "data_asset_type": "test_data_asset_type", + "ge_cloud_id": "test_ge_cloud_id", + "run_validation": "test_run_validation", + "validation_ingestion_policy": "test_validation_ingestion_policy", + "feature_store_id": 67, + "feature_group_id": 15, + "href": "test_/featurestores/67/featuregroups/15/expectationsuite", + "expand": "test_expand", + "items": "test_items", + "type": "expectationSuiteDTO", + "created": "test_created" + } + }, + "method": "GET", + "path_params": [ + "project", + "119", + "featurestores", + 67, + "featuregroups", + "fg_test" + ], + "query_params": { + "version": 1 + }, + "headers": null } } diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index aabf2bf9f6..260cffd0c9 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -159,9 +159,7 @@ "id": 11, "version": 1, "description": "test_description", - "transformation_functions": { - "featurestore_id": 5 - }, + "transformation_functions": {}, "features": [ { "name": "intt", @@ -686,9 +684,31 @@ "id": 11, "version": 1, "description": "test_description", - "transformation_functions": { - "featurestore_id": 5 - }, + "transformation_functions": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, 
statistics=stats):\n return data1 + statistics.data1.mean\n", + "name": "add_mean_fs", + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"] + } + } + ], "features": [ { "name": "intt", @@ -743,5 +763,236 @@ } ] } + }, + "get_transformations": { + "response": { + "name": "test_name", + "query": { + "left_feature_group": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "left_features": ["intt"], + "feature_store_name": "test_feature_store_name", + "feature_store_id": 67, + "left_feature_group_start_time": "test_start_time", + "left_feature_group_end_time": "test_end_time", + "joins": [ + { + "query": { + "left_feature_group": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "left_features": ["intt"], + "feature_store_name": "test_feature_store_name", 
+ "feature_store_id": 67, + "left_feature_group_start_time": "test_left_feature_group_start_time", + "left_feature_group_end_time": "test_left_feature_group_end_time", + "joins": [], + "filter": null + }, + "on": ["test_on"], + "left_on": ["test_left_on"], + "right_on": ["test_right_on"], + "join_type": "inner", + "prefix": "test_prefix" + } + ], + "filter": { + "condition": "test_condition", + "value": "test_value", + "feature": { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + } + } + }, + "featurestore_id": 5, + "id": 11, + "version": 1, + "description": "test_description", + "transformation_functions": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", + "name": "add_mean_fs", + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"], + "dropped_argument_names":["data1"] + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_argument_names":["data1"] + } + } + ], + "features": [ + { + "name": "intt", + "label": "t", + "featuregroup": { + "type": "featuregroupDTO", + "featurestoreId": 67, + "version": 1, + "name": "fg_test", + "id": 15, + "statisticsConfig": { + "enabled": true, + "histograms": false, + "correlations": false, + "exactUniqueness": false, + "columns": [] + }, + "onlineEnabled": false, + "deprecated": false + } + }, + { + "name": "stringt", + "featurestoreId": 67, + "featuregroup": { + "type": "featuregroupDTO", + "featurestoreId": 67, + "version": 1, + "name": "fg_test", + "id": 15, + "statisticsConfig": { + "enabled": true, + "histograms": false, + "correlations": false, + "exactUniqueness": false, + "columns": [] + }, + "onlineEnabled": false, + "deprecated": false + } + } + ] + } } } diff --git a/python/tests/fixtures/training_dataset_feature_fixtures.json b/python/tests/fixtures/training_dataset_feature_fixtures.json index 19a958b970..0ca85653c8 100644 --- a/python/tests/fixtures/training_dataset_feature_fixtures.json +++ b/python/tests/fixtures/training_dataset_feature_fixtures.json @@ -62,27 +62,86 @@ "timeTravelFormat": "HUDI" }, "feature_group_feature_name": "test_feature_group_feature_name", - "label": "test_label", - "transformation_function": { - "count": 1, - "items": [ - { - "featurestore_id": 11, - "transformation_fn": null, - "version": 1, - "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "float", - "id": 43, - "type": "transformationFunctionTDO", - "items": [], - "count": 0, - "href": "test_href" + "label": "test_label" + } + }, + "get_transformations": { + "response": { + "name": "test_name", + "type": "test_type", + "index": "test_index", + "transformation_function": { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_argument_names":["data1"] } - ] + }, + "featuregroup": { + "type": "cachedFeaturegroupDTO", + 
"validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "feature_group_feature_name": "test_feature_group_feature_name", + "label": "test_label" } - } }, "get_fraud_online_training_dataset_features": { "response": [ diff --git a/python/tests/fixtures/training_dataset_fixtures.json b/python/tests/fixtures/training_dataset_fixtures.json index ea3f356e68..6db5d08325 100644 --- a/python/tests/fixtures/training_dataset_fixtures.json +++ b/python/tests/fixtures/training_dataset_fixtures.json @@ -122,21 +122,12 @@ "items": [ { "featurestore_id": 11, - "transformation_fn": "test_transformation_fn", "version": 1, "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "test_output_type", - "id": 43, - "type": "transformationFunctionTDO", - "items": [], - "count": 0, "href": "test_href" } ] - }, - "transformation_function": "test_transformation_function" + } } ], "statistics_config": { @@ -153,7 +144,6 @@ "from_query": "test_from_query", "querydto": "test_querydto", "label": "test_label", - "transformation_functions": "test_transformation_functions", "train_split": "test_train_split", "time_split_size": "test_time_split_size", "type": "trainingDatasetDTO" diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 504671dffc..2604d5d75e 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -1,38 +1,111 @@ { - "get": { + "get_one_argument_no_statistics_function": { "response": { - "featurestore_id": 11, - "transformation_fn": null, - "version": 1, - "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "float", - "id": 43, - "type": "transformationFunctionTDO", - "href": "test_href" + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_argument_names":["data1"] + } } }, - "get_basic_info": { + "get_one_argument_with_statistics_function": { "response": { - "featurestore_id": 11 + "id" 
: 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", + "name": "add_mean_fs", + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"], + "dropped_argument_names":["data1"] + } + } + }, + "get_multiple_argument_with_statistics_function": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n", + "name": "test_func", + "outputTypes":["string"], + "transformationFeatures":["feature1", "feature2", "feature3"], + "statisticsArgumentNames":["data1", "data2"], + "dropped_argument_names":["data1", "data2", "data3"] + } + } + }, + "get_multiple_return_type_functions": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", + "name": "test_func", + "outputTypes":["string", "double"], + "transformationFeatures":["feature1", "feature2", "feature3"], + "statisticsArgumentNames":["data1", "data2"], + "dropped_argument_names":["data1", "data2", "data3"] + } } }, "get_list": { "response": { - "count": 1, + "count": 2, "items": [ { - "featurestore_id": 11, - "transformation_fn": null, + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", + "name": "add_mean_fs", + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"], + "dropped_argument_names":["data1"] + } + }, + { + "id" : 2, "version": 1, - "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "float", - "id": 43, - "type": "transformationFunctionTDO", - "href": "test_href" + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_argument_names":["data1"] + } + } + ] + } + }, + "get_list_one_argument": { + "response": { + "count": 1, + "items": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", + "name": "add_mean_fs", + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"], + "dropped_argument_names":["data1"] + } } ] } @@ -43,4 +116,4 @@ "items": [] } } -} \ No newline at end of file +} diff --git a/python/tests/pyproject.toml b/python/tests/pyproject.toml index 15a77ff4fd..050735f853 100644 --- a/python/tests/pyproject.toml +++ b/python/tests/pyproject.toml @@ -29,3 +29,9 @@ skip-magic-trailing-comma = false # Like Black, automatically detect the appropriate line ending. 
line-ending = "auto" + +[tool.pytest.ini_options] +pythonpath = [ + ".", "tests" +] +addopts = "--ignore=python/tests/test_helper/" diff --git a/python/tests/test_feature.py b/python/tests/test_feature.py index 8194035040..61ce72f288 100644 --- a/python/tests/test_feature.py +++ b/python/tests/test_feature.py @@ -36,6 +36,26 @@ def test_from_response_json(self, backend_fixtures): assert f.online_type == "int" assert f.default_value == "1" # default value should be specified as string assert f._feature_group_id == 15 + assert not f.on_demand + + def test_from_response_json_on_demand(self, backend_fixtures): + # Arrange + json = backend_fixtures["feature"]["get_on_demand"]["response"] + + # Act + f = feature.Feature.from_response_json(json) + + # Assert + assert f.name == "intt" + assert f.type == "int" + assert f.description == "test_description" + assert f.primary is True + assert f.partition is False + assert f.hudi_precombine_key is True + assert f.online_type == "int" + assert f.default_value == "1" # default value should be specified as string + assert f._feature_group_id == 15 + assert f.on_demand def test_from_response_json_basic_info(self, backend_fixtures): # Arrange diff --git a/python/tests/test_feature_group.py b/python/tests/test_feature_group.py index 56b870d23e..8e2ba67cdf 100644 --- a/python/tests/test_feature_group.py +++ b/python/tests/test_feature_group.py @@ -32,6 +32,7 @@ ) from hsfs.client.exceptions import FeatureStoreException, RestAPIError from hsfs.engine import python +from hsfs.hopsworks_udf import UDFType engine.init("python") @@ -145,7 +146,7 @@ def test_from_response_json_basic_info(self, backend_fixtures): assert fg._feature_store_id == 67 assert fg.description == "" assert fg.partition_key == [] - assert fg.primary_key == ['intt'] + assert fg.primary_key == ["intt"] assert fg.hudi_precombine_key is None assert fg._feature_store_name is None assert fg.created is None @@ -322,7 +323,7 @@ def test_constructor_with_list_event_time_for_compatibility( version=1, description="fg_description", event_time=["event_date"], - features=features + features=features, ) with pytest.raises(FeatureStoreException): util.verify_attribute_key_names(new_fg, False) @@ -885,3 +886,57 @@ def test_feature_group_save_expectation_suite_from_hopsworks_type( mock_print.call_args[0][0][:63] == "Updated expectation suite attached to Feature Group, edit it at" ) + + def test_from_response_json_transformation_functions(self, backend_fixtures): + # Arrange + json = backend_fixtures["feature_group"]["get_transformations"]["response"] + + # Act + fg = feature_group.FeatureGroup.from_response_json(json) + + # Assert + assert fg.name == "fg_test" + assert fg.version == 1 + assert fg._feature_store_id == 67 + assert fg.description == "test_description" + assert fg.partition_key == [] + assert fg.primary_key == ["intt"] + assert fg.hudi_precombine_key == "intt" + assert fg._feature_store_name == "test_featurestore" + assert fg.created == "2022-08-01T11:07:55Z" + assert len(fg.transformation_functions) == 2 + assert ( + fg.transformation_functions[0].hopsworks_udf.function_name == "add_one_fs" + ) + assert fg.transformation_functions[1].hopsworks_udf.function_name == "add_two" + assert ( + fg.transformation_functions[0].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + assert ( + fg.transformation_functions[1].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_two(data1 : pd.Series):\n return data1 + 2\n" + ) + assert ( + 
fg.transformation_functions[0].hopsworks_udf.udf_type == UDFType.ON_DEMAND + ) + assert ( + fg.transformation_functions[1].hopsworks_udf.udf_type == UDFType.ON_DEMAND + ) + assert isinstance(fg.creator, user.User) + assert fg.id == 15 + assert len(fg.features) == 2 + assert isinstance(fg.features[0], feature.Feature) + assert ( + fg.location + == "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1" + ) + assert fg.online_enabled is True + assert fg.time_travel_format == "HUDI" + assert isinstance(fg.statistics_config, statistics_config.StatisticsConfig) + assert fg._online_topic_name == "119_15_fg_test_1_onlinefs" + assert fg.event_time is None + assert fg.stream is False + assert ( + fg.expectation_suite.expectation_suite_name == "test_expectation_suite_name" + ) diff --git a/python/tests/test_feature_view.py b/python/tests/test_feature_view.py index 25a1cc6fbe..57aa5c1b93 100644 --- a/python/tests/test_feature_view.py +++ b/python/tests/test_feature_view.py @@ -15,9 +15,10 @@ # import warnings -from hsfs import feature_view, training_dataset_feature, transformation_function +from hsfs import feature_view, training_dataset_feature from hsfs.constructor import fs_query, query from hsfs.feature_store import FeatureStore +from hsfs.hopsworks_udf import UDFType, udf class TestFeatureView: @@ -32,7 +33,6 @@ def test_from_response_json(self, mocker, backend_fixtures): mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.feature_store_api.FeatureStoreApi.get") json = backend_fixtures["feature_view"]["get"]["response"] - # Act fv = feature_view.FeatureView.from_response_json(json) @@ -44,7 +44,7 @@ def test_from_response_json(self, mocker, backend_fixtures): assert fv.version == 1 assert fv.description == "test_description" assert fv.labels == ["intt"] - assert fv.transformation_functions == {} + assert fv.transformation_functions == [] assert len(fv.schema) == 2 assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) @@ -65,10 +65,59 @@ def test_from_response_json_basic_info(self, mocker, backend_fixtures): assert fv.version is None assert fv.description is None assert fv.labels == [] - assert fv.transformation_functions == {} + assert fv.transformation_functions == [] assert len(fv.schema) == 0 assert fv.query._left_feature_group.deprecated is False + def test_from_response_json_transformation_function(self, mocker, backend_fixtures): + # Arrange + mocker.patch.object( + FeatureStore, + "project_id", + return_value=99, + ) + mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.engine.get_type") + mocker.patch("hsfs.core.feature_store_api.FeatureStoreApi.get") + json = backend_fixtures["feature_view"]["get_transformations"]["response"] + # Act + fv = feature_view.FeatureView.from_response_json(json) + + # Assert + assert fv.name == "test_name" + assert fv.id == 11 + assert isinstance(fv.query, query.Query) + assert fv.featurestore_id == 5 + assert fv.version == 1 + assert fv.description == "test_description" + assert fv.labels == ["intt"] + assert len(fv.transformation_functions) == 2 + assert ( + fv.transformation_functions[0].hopsworks_udf.function_name == "add_mean_fs" + ) + assert ( + fv.transformation_functions[1].hopsworks_udf.function_name == "add_one_fs" + ) + assert ( + fv.transformation_functions[0].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" + ) + assert ( + 
fv.transformation_functions[1].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + assert ( + fv.transformation_functions[0].hopsworks_udf.udf_type + == UDFType.MODEL_DEPENDENT + ) + assert ( + fv.transformation_functions[1].hopsworks_udf.udf_type + == UDFType.MODEL_DEPENDENT + ) + + assert len(fv.schema) == 2 + assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) + def test_from_response_json_basic_info_deprecated(self, mocker, backend_fixtures): # Arrange mocker.patch("hsfs.engine.get_type") @@ -87,7 +136,7 @@ def test_from_response_json_basic_info_deprecated(self, mocker, backend_fixtures assert fv.version is None assert fv.description is None assert fv.labels == [] - assert fv.transformation_functions == {} + assert fv.transformation_functions == [] assert len(fv.schema) == 0 assert fv.query._left_feature_group.deprecated is True assert len(warning_record) == 1 @@ -104,31 +153,18 @@ def test_transformation_function_instances(self, mocker, backend_fixtures): # Act q = fs_query.FsQuery.from_response_json(json) - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf + @udf(int) + def test(col1): + return col1 + 1 fv = feature_view.FeatureView( featurestore_id=feature_store_id, name="test_fv", version=1, query=q, - transformation_functions=transformation_fn_dict, + transformation_functions=[test("data1"), test("data2")], ) - updated_transformation_fn_dict = fv.transformation_functions + transformation_functions = fv.transformation_functions - assert ( - updated_transformation_fn_dict["tf_name"] - != updated_transformation_fn_dict["tf1_name"] - ) + assert transformation_functions[0] != transformation_functions[1] diff --git a/python/tests/test_helpers/__init__.py b/python/tests/test_helpers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/tests/test_helpers/transformation_test_helper.py b/python/tests/test_helpers/transformation_test_helper.py new file mode 100644 index 0000000000..2a502692a1 --- /dev/null +++ b/python/tests/test_helpers/transformation_test_helper.py @@ -0,0 +1,81 @@ +import pandas as pd +from hsfs.transformation_statistics import TransformationStatistics + + +stats_arg1 = TransformationStatistics("arg1") +stats_arg1_arg3 = TransformationStatistics("arg1", "arg3") +stats_arg1_arg2 = TransformationStatistics("arg1", "arg2") +stats_arg3 = TransformationStatistics("arg3") + + +def test_function(): + return True + + +def test_function_one_argument(arg1): + pass + + +def test_function_one_argument_with_statistics(arg1, statistics=stats_arg1): + pass + + +def test_function_one_argument_with_typehints(arg1: pd.Series): + pass + + +def test_function_one_argument_with_statistics_and_typehints( + arg1: pd.Series, statistics=stats_arg1 +): + pass + + +def test_function_multiple_argument(arg1, arg2): + pass + + +def test_function_multiple_argument_with_statistics( + arg1, arg2, arg3, statistics=stats_arg1_arg3 +): + pass + + +def test_function_multiple_argument_with_typehints(arg1: pd.Series, arg2: pd.Series): + pass + + +def test_function_multiple_argument_with_statistics_and_typehints( + arg1: pd.Series, arg2: pd.Series, statistics=stats_arg1_arg2 +): + pass + + +def 
test_function_multiple_argument_with_mixed_statistics_and_typehints( + arg1: pd.Series, arg2, arg3, statistics=stats_arg1_arg3 +): + pass + + +def test_function_multiple_argument_all_parameter_with_spaces( + arg1: pd.Series, arg2, statistics=stats_arg1_arg2 +): + pass + + +def test_function_multiple_argument_all_parameter_multiline( + arg1: pd.Series, arg2, arg3, statistics=stats_arg1_arg3 +): + pass + + +def test_function_multiple_argument_all_parameter_multiline_with_comments( + arg1: pd.Series, # Test Comment + arg2, + arg3, # Test Comment + statistics=stats_arg1_arg3, # Test Comment +) -> pd.DataFrame: # Test Comment + pass + + +def test_function_statistics_invalid(arg1: pd.Series, statistics=stats_arg3): + pass diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py new file mode 100644 index 0000000000..06ffb19742 --- /dev/null +++ b/python/tests/test_hopswork_udf.py @@ -0,0 +1,781 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from datetime import date, datetime, time + +import pandas as pd +import pytest +from hsfs.client.exceptions import FeatureStoreException +from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, UDFType, udf + + +class TestHopsworksUdf: + def test_validate_and_convert_output_types_one_elements(self): + assert HopsworksUdf._validate_and_convert_output_types([int]) == ["bigint"] + + assert HopsworksUdf._validate_and_convert_output_types([float]) == ["double"] + + assert HopsworksUdf._validate_and_convert_output_types([str]) == ["string"] + + assert HopsworksUdf._validate_and_convert_output_types([bool]) == ["boolean"] + + assert HopsworksUdf._validate_and_convert_output_types([datetime]) == [ + "timestamp" + ] + + assert HopsworksUdf._validate_and_convert_output_types([time]) == ["timestamp"] + + assert HopsworksUdf._validate_and_convert_output_types([date]) == ["date"] + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + def test_validate_and_convert_output_types_multiple_types(self): + assert HopsworksUdf._validate_and_convert_output_types( + [int, float, str, bool, datetime, date, time] + ) == ["bigint", "double", "string", "boolean", "timestamp", "date", "timestamp"] + + assert HopsworksUdf._validate_and_convert_output_types( + ["bigint", "double", "string", "boolean", "timestamp", "date"] + ) == ["bigint", "double", "string", "boolean", "timestamp", "date"] + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." 
+ ) + + def test_validate_and_convert_output_types_invalid_types(self): + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([int, pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([int, "pd.DatetimeTZDtype"]) + + assert ( + str(exception.value) + == "Output type pd.DatetimeTZDtype is not supported. Please refer to the documentation to get more information on the supported types." + ) + + def test_get_module_imports(self): + assert HopsworksUdf._get_module_imports( + "python/tests/test_helpers/transformation_test_helper.py" + ) == [ + "import pandas as pd", + "from hsfs.transformation_statistics import TransformationStatistics", + ] + + def test_extract_source_code(self): + from test_helpers.transformation_test_helper import test_function + + assert """import pandas as pd +from hsfs.transformation_statistics import TransformationStatistics +def test_function(): + return True""" == HopsworksUdf._extract_source_code(test_function).strip() + + def test_extract_function_arguments_no_arguments(self): + from test_helpers.transformation_test_helper import test_function + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._extract_function_arguments(test_function) + + assert ( + str(exception.value) + == "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." 
+ ) + + def test_extract_function_arguments_one_argument(self): + from test_helpers.transformation_test_helper import test_function_one_argument + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_one_argument + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None) + ] + + def test_extract_function_arguments_one_argument_with_statistics(self): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_statistics, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_one_argument_with_statistics + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1") + ] + + def test_extract_function_arguments_one_argument_with_typehint(self): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_typehints, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_one_argument_with_typehints + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None) + ] + + def test_extract_function_arguments_one_argument_with_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_statistics_and_typehints, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_one_argument_with_statistics_and_typehints + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1") + ] + + def test_extract_function_arguments_multiple_argument(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + ] + + def test_extract_function_arguments_multiple_argument_with_statistics(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_statistics, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument_with_statistics + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), + ] + + def test_extract_function_arguments_multiple_argument_with_typehints(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_typehints, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument_with_typehints + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + ] + + def test_extract_function_arguments_multiple_argument_with_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_statistics_and_typehints, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument_with_statistics_and_typehints + ) + + assert 
function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name="arg2"), + ] + + def test_extract_function_arguments_multiple_argument_with_mixed_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_mixed_statistics_and_typehints, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument_with_mixed_statistics_and_typehints + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), + ] + + def test_extract_function_arguments_multiple_argument_all_parameter_with_spaces( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_with_spaces, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument_all_parameter_with_spaces + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name="arg2"), + ] + + def test_extract_function_arguments_multiple_argument_all_parameter_multiline(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_multiline, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument_all_parameter_multiline + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), + ] + + def test_extract_function_arguments_multiple_argumen_all_parameter_multiline_with_comments( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_multiline_with_comments, + ) + + function_argument = HopsworksUdf._extract_function_arguments( + test_function_multiple_argument_all_parameter_multiline_with_comments + ) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), + ] + + def test_extract_function_arguments_statistics_invalid(self): + from test_helpers.transformation_test_helper import ( + test_function_statistics_invalid, + ) + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._extract_function_arguments(test_function_statistics_invalid) + + assert ( + str(exception.value) + == "No argument corresponding to statistics parameter 'arg3' present in function definition." 
+ ) + + def test_format_source_code(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_multiline_with_comments, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_all_parameter_multiline_with_comments + ) + + formated_source, module_imports = HopsworksUdf._format_source_code( + function_source + ) + + assert ( + formated_source.strip() + == """def test_function_multiple_argument_all_parameter_multiline_with_comments(arg1, arg2, arg3): +\t pass""" + ) + + def test_generate_output_column_names_one_argument_one_output_type(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == ["test_func_col1_"] + + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + + def test_generate_output_column_names_one_argument_one_output_type_prefix(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == ["test_func_prefix_col1_"] + assert test_func.output_column_names == ["prefix_test_func_prefix_col1_"] + + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + assert test_func.output_column_names == ["prefix_test_func"] + + def test_generate_output_column_names_multiple_argument_one_output_type(self): + @udf(int) + def test_func(col1, col2, col3): + return col1 + 1 + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + + def test_generate_output_column_names_multiple_argument_one_output_type_prefix( + self, + ): + @udf(int) + def test_func(col1, col2, col3): + return col1 + 1 + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_prefix_col2_prefix_col3_" + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_" + ] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + assert test_func.output_column_names == ["prefix_test_func"] + + def test_generate_output_column_names_single_argument_multiple_output_type(self): + @udf([int, float, int]) + def test_func(col1): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_col1_0", + "test_func_col1_1", + "test_func_col1_2", + ] + + def test_generate_output_column_names_single_argument_multiple_output_type_prefix( + self, + ): + @udf([int, float, int]) + def test_func(col1): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_0", + "test_func_prefix_col1_1", + "test_func_prefix_col1_2", + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_0", + "prefix_test_func_prefix_col1_1", + "prefix_test_func_prefix_col1_2", + ] + + def 
test_generate_output_column_names_multiple_argument_multiple_output_type(self): + @udf([int, float, int]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_col1_col2_col3_0", + "test_func_col1_col2_col3_1", + "test_func_col1_col2_col3_2", + ] + + def test_generate_output_column_names_multiple_argument_multiple_output_type_prefix( + self, + ): + @udf([int, float, int]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_prefix_col2_prefix_col3_0", + "test_func_prefix_col1_prefix_col2_prefix_col3_1", + "test_func_prefix_col1_prefix_col2_prefix_col3_2", + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_0", + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_1", + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_2", + ] + + def test_drop_features_one_element(self): + @udf([int, float, int], drop="col1") + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func.dropped_features == ["col1"] + + def test_drop_features_one_element_prefix(self): + @udf([int, float, int], drop="col1") + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func._dropped_features == ["col1"] + assert test_func.dropped_features == ["prefix_col1"] + + def test_drop_features_multiple_element(self): + @udf([int, float, int], drop=["col1", "col2"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func.dropped_features == ["col1", "col2"] + + def test_drop_features_multiple_element_prefix(self): + @udf([int, float, int], drop=["col1", "col2"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func._dropped_features == ["col1", "col2"] + assert test_func.dropped_features == ["prefix_col1", "prefix_col2"] + + def test_drop_features_invalid(self): + with pytest.raises(FeatureStoreException) as exp: + + @udf([int, float, int], drop=["col1", "invalid_col"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + assert ( + str(exp.value) + == "Cannot drop features 'invalid_col' as they are not features given as arguments in the defined UDF." 
+ ) + + def test_create_pandas_udf_return_schema_from_list_one_output_type(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + assert test_func._create_pandas_udf_return_schema_from_list() == "bigint" + + def test_create_pandas_udf_return_schema_from_list_one_argument_multiple_output_type( + self, + ): + @udf([int, float, str, date, datetime, time, bool]) + def test_func(col1): + return pd.DataFrame( + { + "col1": [col1 + 1], + "col2": [col1 + 1], + "col3": [col1 + 1], + "col4": [col1 + 1], + "col5": [col1 + 1], + "col6": [True], + } + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert ( + test_func._create_pandas_udf_return_schema_from_list() + == "`test_func_col1_0` bigint, `test_func_col1_1` double, `test_func_col1_2` string, `test_func_col1_3` date, `test_func_col1_4` timestamp, `test_func_col1_5` timestamp, `test_func_col1_6` boolean" + ) + + def test_hopsworks_wrapper_single_output(self): + test_dataframe = pd.DataFrame({"col1": [1, 2, 3, 4]}) + + @udf(int) + def test_func(col1): + return col1 + 1 + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + renaming_wrapper_function = test_func.hopsworksUdf_wrapper() + + result = renaming_wrapper_function(test_dataframe["col1"]) + + assert result.name == "test_func_col1_" + assert result.values.tolist() == [2, 3, 4, 5] + + test_func.udf_type = UDFType.ON_DEMAND + + renaming_wrapper_function = test_func.hopsworksUdf_wrapper() + + result = renaming_wrapper_function(test_dataframe["col1"]) + + assert result.name == "test_func" + assert result.values.tolist() == [2, 3, 4, 5] + + def test_hopsworks_wrapper_multiple_output(self): + @udf([int, float]) + def test_func(col1, col2): + return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + renaming_wrapper_function = test_func.hopsworksUdf_wrapper() + + test_dataframe = pd.DataFrame( + {"column1": [1, 2, 3, 4], "column2": [10, 20, 30, 40]} + ) + + result = renaming_wrapper_function( + test_dataframe["column1"], test_dataframe["column2"] + ) + + assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"]) + assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] + + def test_HopsworkUDf_call_one_argument(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + assert test_func.transformation_features == ["col1"] + assert test_func.statistics_features == [] + + assert test_func("new_feature").transformation_features == ["new_feature"] + assert test_func("new_feature").statistics_features == [] + + # Test with prefix + test_func._feature_name_prefix = "prefix_" + assert test_func.transformation_features == ["prefix_col1"] + assert test_func.statistics_features == [] + + assert test_func("new_feature").transformation_features == [ + "prefix_new_feature" + ] + assert test_func("new_feature").statistics_features == [] + + def test_HopsworkUDf_call_one_argument_statistics(self): + from hsfs.transformation_statistics import TransformationStatistics + + stats = TransformationStatistics("col1") + + @udf(int) + def test_func(col1, statistics=stats): + return col1 + statistics.col1.mean + + assert test_func.transformation_features == ["col1"] + assert test_func.statistics_features == ["col1"] + assert test_func._statistics_argument_names == ["col1"] + + assert test_func("new_feature").transformation_features == ["new_feature"] + assert test_func("new_feature").statistics_features == ["new_feature"] + assert test_func("new_feature")._statistics_argument_names == ["col1"] + + # Test with prefix + 
test_func._feature_name_prefix = "prefix_" + assert test_func.transformation_features == ["prefix_col1"] + assert test_func.statistics_features == ["col1"] + assert test_func._statistics_argument_names == ["col1"] + + assert test_func("new_feature").transformation_features == [ + "prefix_new_feature" + ] + assert test_func("new_feature").statistics_features == ["new_feature"] + assert test_func("new_feature")._statistics_argument_names == ["col1"] + + def test_HopsworkUDf_call_multiple_argument_statistics(self): + from hsfs.transformation_statistics import TransformationStatistics + + stats = TransformationStatistics("col1", "col3") + + @udf(int) + def test_func(col1, col2, col3, statistics=stats): + return col1 + statistics.col1.mean + statistics.col3.mean + + assert test_func.transformation_features == ["col1", "col2", "col3"] + assert test_func.statistics_features == ["col1", "col3"] + + assert test_func("f1", "f2", "f3").transformation_features == ["f1", "f2", "f3"] + assert test_func("f1", "f2", "f3").statistics_features == ["f1", "f3"] + assert test_func("f1", "f2", "f3")._statistics_argument_names == [ + "col1", + "col3", + ] + + def test_validate_and_convert_drop_features(self): + dropped_features = "feature1" + transformation_feature = ["feature1", "feature2"] + feature_name_prefix = None + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1"] + + def test_validate_and_convert_drop_features_dropped_list(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1", "feature2"] + + def test_validate_and_convert_drop_features_dropped_invalid(self): + dropped_features = "feature4" + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'feature4' as they are not features given as arguments in the defined UDF." + ) + + def test_validate_and_convert_drop_features_dropped_invalid_list(self): + dropped_features = ["feature4", "feature5"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'feature4', 'feature5' as they are not features given as arguments in the defined UDF." 
+ ) + + def test_validate_and_convert_drop_features_dropped_list_prefix(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["test_feature1", "test_feature2", "test_feature3"] + feature_name_prefix = "test_" + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1", "feature2"] + + def test_validate_and_convert_drop_features_dropped_prefix_invalid(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = "test_" + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'test_feature1', 'test_feature2' as they are not features given as arguments in the defined UDF." + ) + + def test_validate_udf_type_None(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + with pytest.raises(FeatureStoreException) as exe: + test_func._validate_udf_type() + test_func.get_udf() + + assert str(exe.value) == "UDF Type cannot be None" + + def test_validate_udf_type_on_demand_multiple_output(self): + @udf([int, float]) + def test_func(col1, col2): + return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) + + with pytest.raises(FeatureStoreException) as exe: + test_func.udf_type = UDFType.ON_DEMAND + + assert ( + str(exe.value) + == "On-Demand Transformation functions can only return one column as output" + ) + + def test_validate_udf_type_on_demand_statistics(self): + from hsfs.transformation_statistics import TransformationStatistics + + stats = TransformationStatistics("col1") + + @udf(int) + def test_func(col1, statistics=stats): + return col1 + statistics.col1.mean + + with pytest.raises(FeatureStoreException) as exe: + test_func.udf_type = UDFType.ON_DEMAND + + assert ( + str(exe.value) + == "On-Demand Transformation functions cannot use statistics, please remove statistics parameters from the functions" + ) diff --git a/python/tests/test_training_dataset.py b/python/tests/test_training_dataset.py index 416f3cb860..be771406b2 100644 --- a/python/tests/test_training_dataset.py +++ b/python/tests/test_training_dataset.py @@ -57,7 +57,6 @@ def test_from_response_json(self, mocker, backend_fixtures): assert td._from_query == "test_from_query" assert td._querydto == "test_querydto" assert td.feature_store_id == 22 - assert td.transformation_functions == "test_transformation_functions" assert td.train_split == "test_train_split" assert td.training_dataset_type == "HOPSFS_TRAINING_DATASET" assert isinstance(td.storage_connector, storage_connector.JdbcConnector) @@ -102,7 +101,6 @@ def test_from_response_json_basic_info(self, mocker, backend_fixtures): assert td._from_query is None assert td._querydto is None assert td.feature_store_id == 22 - assert td.transformation_functions is None assert td.train_split is None assert td.training_dataset_type is None assert isinstance(td.storage_connector, storage_connector.JdbcConnector) diff --git a/python/tests/test_training_dataset_feature.py b/python/tests/test_training_dataset_feature.py index 62a30aca5a..81c7fd6d14 100644 --- a/python/tests/test_training_dataset_feature.py +++ b/python/tests/test_training_dataset_feature.py @@ -15,7 +15,8 @@ # -from hsfs import feature_group, training_dataset_feature, transformation_function +from hsfs import 
feature_group, training_dataset_feature +from hsfs.hopsworks_udf import UDFType class TestTrainingDatasetFeature: @@ -37,12 +38,41 @@ def test_from_response_json(self, backend_fixtures): td_feature._feature_group_feature_name == "test_feature_group_feature_name" ) assert td_feature.label == "test_label" - assert len(td_feature.transformation_function) == 1 - assert isinstance( - td_feature.transformation_function[0], - transformation_function.TransformationFunction, + + def test_from_response_json_on_demand_transformation(self, backend_fixtures): + # Arrange + json = backend_fixtures["training_dataset_feature"]["get_transformations"][ + "response" + ] + + # Act + td_feature = training_dataset_feature.TrainingDatasetFeature.from_response_json( + json ) + # Assert + assert td_feature.name == "test_name" + assert td_feature.type == "test_type" + assert td_feature.index == "test_index" + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf.function_name + == "add_one_fs" + ) + + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf.udf_type + == UDFType.ON_DEMAND + ) + assert isinstance(td_feature._feature_group, feature_group.FeatureGroup) + assert ( + td_feature._feature_group_feature_name == "test_feature_group_feature_name" + ) + assert td_feature.label == "test_label" + def test_from_response_json_basic_info(self, backend_fixtures): # Arrange json = backend_fixtures["training_dataset_feature"]["get_basic_info"][ @@ -61,4 +91,3 @@ def test_from_response_json_basic_info(self, backend_fixtures): assert td_feature._feature_group is None assert td_feature._feature_group_feature_name is None assert td_feature.label is False - assert td_feature.transformation_function is None diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index 41123ff791..0b83832755 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -15,85 +15,225 @@ # -from hsfs import transformation_function +import pytest +from hsfs.client.exceptions import FeatureStoreException +from hsfs.hopsworks_udf import UDFType, udf +from hsfs.transformation_function import TransformationFunction class TestTransformationFunction: - def test_from_response_json(self, backend_fixtures): + def test_from_response_json_one_argument_no_statistics(self, backend_fixtures): # Arrange - json = backend_fixtures["transformation_function"]["get"]["response"] + json = backend_fixtures["transformation_function"][ + "get_one_argument_no_statistics_function" + ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_one_fs" + assert tf.hopsworks_udf.return_types == ["double"] + assert not tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["col1"] + assert tf.hopsworks_udf.statistics_features == [] + assert tf.hopsworks_udf._statistics_argument_names == [] + assert ( + tf.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + + def test_from_response_json_one_argument_with_statistics(self, backend_fixtures): + # Arrange + json = 
backend_fixtures["transformation_function"][ + "get_one_argument_with_statistics_function" + ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act - tf = transformation_function.TransformationFunction.from_response_json(json) + tf = TransformationFunction.from_response_json(json) # Assert - assert tf.id == 43 + assert tf.id == 1 assert tf._featurestore_id == 11 - assert tf.version == 1 - assert tf.name == "test_name" - assert tf.transformation_fn is None - assert tf.output_type == "FLOAT" + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.return_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] assert ( - tf.source_code_content - == '{"module_imports": "", "transformer_code": "test_builtin_source_code"}' + tf.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None - def test_from_response_json_basic_info(self, mocker, backend_fixtures): + def test_from_response_json_multiple_argument_with_statistics( + self, backend_fixtures + ): # Arrange - mocker.patch( - "hsfs.transformation_function.TransformationFunction._load_source_code" + json = backend_fixtures["transformation_function"][ + "get_multiple_argument_with_statistics_function" + ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT + + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "test_func" + assert tf.hopsworks_udf.return_types == ["string"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == [ + "feature1", + "feature2", + "feature3", + ] + assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1", "data2"] + assert ( + tf.hopsworks_udf._function_source + == "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n" ) - json = backend_fixtures["transformation_function"]["get_basic_info"]["response"] + + def test_from_response_json_multiple_return_type_functions(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"][ + "get_multiple_return_type_functions" + ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act - tf = transformation_function.TransformationFunction.from_response_json(json) + tf = TransformationFunction.from_response_json(json) # Assert - assert tf.id is None + assert tf.id == 1 assert tf._featurestore_id == 11 - assert tf.version is None - assert tf.name is None - assert tf.transformation_fn is None - assert tf.output_type == "STRING" - assert tf.source_code_content is None - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "test_func" + assert tf.hopsworks_udf.return_types == ["string", "double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == [ + "feature1", + "feature2", + "feature3", + ] + assert 
tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1", "data2"] + assert ( + tf.hopsworks_udf._function_source + == "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n" + ) + + def test_from_response_json_list_empty(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"]["get_list_empty"]["response"] + + # Act + tf_list = TransformationFunction.from_response_json(json) + + # Assert + assert len(tf_list) == 0 def test_from_response_json_list(self, backend_fixtures): # Arrange json = backend_fixtures["transformation_function"]["get_list"]["response"] + for response_json in json["items"]: + response_json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act - tf_list = transformation_function.TransformationFunction.from_response_json( - json - ) + tf_list = TransformationFunction.from_response_json(json) # Assert - assert len(tf_list) == 1 + assert len(tf_list) == 2 tf = tf_list[0] - assert tf.id == 43 + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.return_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] + assert ( + tf.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" + ) + + tf = tf_list[1] + assert tf.id == 2 assert tf._featurestore_id == 11 assert tf.version == 1 - assert tf.name == "test_name" - assert tf.transformation_fn is None - assert tf.output_type == "FLOAT" + assert tf.hopsworks_udf.function_name == "add_one_fs" + assert tf.hopsworks_udf.return_types == ["double"] + assert not tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["col1"] + assert tf.hopsworks_udf.statistics_features == [] + assert tf.hopsworks_udf._statistics_argument_names == [] assert ( - tf.source_code_content - == '{"module_imports": "", "transformer_code": "test_builtin_source_code"}' + tf.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None - def test_from_response_json_list_empty(self, backend_fixtures): + def test_from_response_json_list_one_argument(self, backend_fixtures): # Arrange - json = backend_fixtures["transformation_function"]["get_list_empty"]["response"] + json = backend_fixtures["transformation_function"]["get_list_one_argument"][ + "response" + ] + for response_json in json["items"]: + response_json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act - tf_list = transformation_function.TransformationFunction.from_response_json( - json - ) + tf = TransformationFunction.from_response_json(json) # Assert - assert len(tf_list) == 0 + assert not isinstance(tf, list) + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.return_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == 
["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] + assert ( + tf.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" + ) + + def test_transformation_function_definition_no_hopworks_udf(self): + def test(col1): + return col1 + 1 + + with pytest.raises(FeatureStoreException) as exception: + TransformationFunction( + featurestore_id=10, + hopsworks_udf=test, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + + assert ( + str(exception.value) + == "Please use the hopsworks_udf decorator when defining transformation functions." + ) + + def test_transformation_function_definition_with_hopworks_udf(self): + @udf(int) + def test2(col1): + return col1 + 1 + + tf = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test2, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + + assert tf.hopsworks_udf == test2 diff --git a/python/tests/test_transformation_function_attached.py b/python/tests/test_transformation_function_attached.py deleted file mode 100644 index 85effdd06e..0000000000 --- a/python/tests/test_transformation_function_attached.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Copyright 2022 Hopsworks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - - -from hsfs import transformation_function, transformation_function_attached - - -class TestTransformationFunctionAttached: - def test_from_response_json(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get"]["response"] - - # Act - tf_attached = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_basic_info(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_basic_info"][ - "response" - ] - - # Act - tf_attached = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_list(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_list"][ - "response" - ] - - # Act - tf_attached_list = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert len(tf_attached_list) == 1 - tf_attached = tf_attached_list[0] - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_list_empty(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_list_empty"][ - "response" - ] - - # Act - tf_attached_list = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert len(tf_attached_list) == 0
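The tests above exercise the new udf decorator, TransformationStatistics, and TransformationFunction API introduced by this change. The following is a minimal, illustrative sketch assembled from the calls asserted in those tests; the feature name "amount", the rebinding to "transaction_amount", and featurestore_id=10 are placeholders, not values taken from this patch.

import pandas as pd

from hsfs.hopsworks_udf import UDFType, udf
from hsfs.transformation_function import TransformationFunction
from hsfs.transformation_statistics import TransformationStatistics

# Request training-dataset statistics for the UDF argument named "amount".
stats = TransformationStatistics("amount")


@udf(float, drop=["amount"])
def scale_amount(amount: pd.Series, statistics=stats) -> pd.Series:
    # Model-dependent transformation: standardize using stored statistics.
    return (amount - statistics.amount.mean) / statistics.amount.stddev


# Rebind the UDF to a concrete feature name, as test_func("f1", "f2", "f3") does above.
bound = scale_amount("transaction_amount")
assert bound.transformation_features == ["transaction_amount"]

# Wrap the decorated UDF in a TransformationFunction, as the definition tests do.
tf = TransformationFunction(
    featurestore_id=10,
    hopsworks_udf=scale_amount,
    transformation_type=UDFType.MODEL_DEPENDENT,
)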