From 4d30232369232f77cde97383c09ee6c3994e3403 Mon Sep 17 00:00:00 2001
From: najiyacl
Date: Wed, 12 Jul 2023 17:23:59 +0530
Subject: [PATCH 01/15] model save changes

---
 ads/feature_store/common/utils/utility.py    |  11 +-
 ads/feature_store/mixin/oci_feature_store.py |   6 +
 ads/model/generic_model.py                   | 262 ++++++++++---------
 3 files changed, 156 insertions(+), 123 deletions(-)

diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py
index e7fae76e5..d98589de2 100644
--- a/ads/feature_store/common/utils/utility.py
+++ b/ads/feature_store/common/utils/utility.py
@@ -19,7 +19,7 @@ from ads.feature_store.feature_group_expectation import Rule, Expectation
 from ads.feature_store.input_feature_detail import FeatureDetail
 from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton
-
+import re
 try:
     from pyspark.pandas import DataFrame
 except ModuleNotFoundError:
@@ -401,3 +401,12 @@ def validate_input_feature_details(input_feature_details, data_frame):
     if isinstance(data_frame, pd.DataFrame):
         return convert_pandas_datatype_with_schema(input_feature_details, data_frame)
     return convert_spark_dataframe_with_schema(input_feature_details, data_frame)
+
+
+def validate_model_ocid(model_ocid):
+    pattern = r'^ocid1\.datasciencemodel\.oc(?P<realm>[0-17]+)\.(?P<region>[A-Za-z0-9]+)?\.?(?P<future_use>[A-Za-z0-9]+)?\.(?P<unique_id>[A-Za-z0-9]+)$'
+    match = re.match(pattern, model_ocid)
+    if match:
+        # groups = match.groupdict()
+        return True
+    return False
diff --git a/ads/feature_store/mixin/oci_feature_store.py b/ads/feature_store/mixin/oci_feature_store.py
index 85392f027..09a2ff108 100644
--- a/ads/feature_store/mixin/oci_feature_store.py
+++ b/ads/feature_store/mixin/oci_feature_store.py
@@ -6,6 +6,7 @@
 from ads.common.oci_mixin import OCIModelMixin

 import oci.feature_store
+import os


 class OCIFeatureStoreMixin(OCIModelMixin):
@@ -13,6 +14,11 @@ class OCIFeatureStoreMixin(OCIModelMixin):
     def init_client(
         cls, **kwargs
     ) -> oci.feature_store.feature_store_client.FeatureStoreClient:
+
+        fs_service_endpoint = os.environ.get("OCI_FS_SERVICE_ENDPOINT")
+        kwargs = {"service_endpoint": fs_service_endpoint}
+
+
         client = cls._init_client(
             client=oci.feature_store.feature_store_client.FeatureStoreClient, **kwargs
         )
diff --git a/ads/model/generic_model.py b/ads/model/generic_model.py
index f39aa27dc..ae517c886 100644
--- a/ads/model/generic_model.py
+++ b/ads/model/generic_model.py
@@ -65,7 +65,7 @@
     Framework,
     ModelCustomMetadata,
     ModelProvenanceMetadata,
-    ModelTaxonomyMetadata,
+    ModelTaxonomyMetadata, MetadataCustomCategory,
 )
 from ads.model.model_metadata_mixin import MetadataMixin
 from ads.model.model_properties import ModelProperties
@@ -99,7 +99,6 @@
 MODEL_DEPLOYMENT_INSTANCE_COUNT = 1
 MODEL_DEPLOYMENT_BANDWIDTH_MBPS = 10
-

 DEFAULT_MODEL_FOLDER_NAME = "model"
 ONNX_DATA_TRANSFORMER = "onnx_data_transformer.json"
@@ -136,7 +135,7 @@ class DataScienceModelType(str, metaclass=ExtendedEnumMeta):
     MODEL = "datasciencemodel"


-class NotActiveDeploymentError(Exception): # pragma: no cover
+class NotActiveDeploymentError(Exception): # pragma: no cover
     def __init__(self, state: str):
         msg = (
             "To perform a prediction the deployed model needs to be in an active state.
" @@ -145,15 +144,15 @@ def __init__(self, state: str): super().__init__(msg) -class SerializeModelNotImplementedError(NotImplementedError): # pragma: no cover +class SerializeModelNotImplementedError(NotImplementedError): # pragma: no cover pass -class SerializeInputNotImplementedError(NotImplementedError): # pragma: no cover +class SerializeInputNotImplementedError(NotImplementedError): # pragma: no cover pass -class RuntimeInfoInconsistencyError(Exception): # pragma: no cover +class RuntimeInfoInconsistencyError(Exception): # pragma: no cover pass @@ -349,9 +348,9 @@ def __init__( Instance of ads.model.SERDE. Used for serialize/deserialize model input. """ if ( - artifact_dir - and ObjectStorageDetails.is_oci_path(artifact_dir) - and not self._PREFIX == "spark" + artifact_dir + and ObjectStorageDetails.is_oci_path(artifact_dir) + and not self._PREFIX == "spark" ): raise ValueError( f"Unsupported value of `artifact_dir`: {artifact_dir}. " @@ -416,9 +415,9 @@ def __init__( self.ignore_conda_error = False def _init_serde( - self, - model_input_serde: Union[SERDE, str] = None, - model_save_serializer: Union[SERDE, str] = None, + self, + model_input_serde: Union[SERDE, str] = None, + model_save_serializer: Union[SERDE, str] = None, ): """Initializes serde. @@ -513,8 +512,8 @@ def _to_yaml(self): return yaml.safe_dump(self._to_dict()) def set_model_input_serializer( - self, - model_input_serializer: Union[str, SERDE], + self, + model_input_serializer: Union[str, SERDE], ): """Registers serializer used for serializing data passed in verify/predict. @@ -646,12 +645,12 @@ def set_model_save_serializer(self, model_save_serializer: Union[str, SERDE]): ) def serialize_model( - self, - as_onnx: bool = False, - initial_types: List[Tuple] = None, - force_overwrite: bool = False, - X_sample: any = None, - **kwargs, + self, + as_onnx: bool = False, + initial_types: List[Tuple] = None, + force_overwrite: bool = False, + X_sample: any = None, + **kwargs, ): """ Serialize and save model using ONNX or model specific method. @@ -689,11 +688,11 @@ def serialize_model( ) def _serialize_model_helper( - self, - initial_types: List[Tuple] = None, - force_overwrite: bool = False, - X_sample: any = None, - **kwargs, + self, + initial_types: List[Tuple] = None, + force_overwrite: bool = False, + X_sample: any = None, + **kwargs, ): model_path = self._check_model_file( self.model_file_name, force_overwrite=force_overwrite @@ -753,17 +752,17 @@ def _set_model_save_serializer_to_onnx(self): raise e def _onnx_data_transformer( - self, - X: Union[pd.DataFrame, pd.Series], - impute_values: Dict = None, - force_overwrite: bool = False, + self, + X: Union[pd.DataFrame, pd.Series], + impute_values: Dict = None, + force_overwrite: bool = False, ): """Apply onnx data transformer to data.""" if self.framework in FRAMEWORKS_WITHOUT_ONNX_DATA_TRANSFORM or X is None: return X try: if hasattr(self, "onnx_data_preprocessor") and isinstance( - self.onnx_data_preprocessor, ONNXTransformer + self.onnx_data_preprocessor, ONNXTransformer ): X = self.onnx_data_preprocessor.transform(X=X) @@ -772,8 +771,8 @@ def _onnx_data_transformer( X=X, impute_values=impute_values ) if ( - os.path.exists(os.path.join(self.artifact_dir, ONNX_DATA_TRANSFORMER)) - and not force_overwrite + os.path.exists(os.path.join(self.artifact_dir, ONNX_DATA_TRANSFORMER)) + and not force_overwrite ): raise ValueError( f"{ONNX_DATA_TRANSFORMER} already exists. 
" @@ -795,26 +794,26 @@ def _onnx_data_transformer( return X def prepare( - self, - inference_conda_env: str = None, - inference_python_version: str = None, - training_conda_env: str = None, - training_python_version: str = None, - model_file_name: str = None, - as_onnx: bool = False, - initial_types: List[Tuple] = None, - force_overwrite: bool = False, - namespace: str = CONDA_BUCKET_NS, - use_case_type: str = None, - X_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, - y_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, - training_script_path: str = None, - training_id: str = _TRAINING_RESOURCE_ID, - ignore_pending_changes: bool = True, - max_col_num: int = DATA_SCHEMA_MAX_COL_NUM, - ignore_conda_error: bool = False, - score_py_uri: str = None, - **kwargs: Dict, + self, + inference_conda_env: str = None, + inference_python_version: str = None, + training_conda_env: str = None, + training_python_version: str = None, + model_file_name: str = None, + as_onnx: bool = False, + initial_types: List[Tuple] = None, + force_overwrite: bool = False, + namespace: str = CONDA_BUCKET_NS, + use_case_type: str = None, + X_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, + y_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, + training_script_path: str = None, + training_id: str = _TRAINING_RESOURCE_ID, + ignore_pending_changes: bool = True, + max_col_num: int = DATA_SCHEMA_MAX_COL_NUM, + ignore_conda_error: bool = False, + score_py_uri: str = None, + **kwargs: Dict, ) -> "GenericModel": """Prepare and save the score.py, serialized model and runtime.yaml file. @@ -919,7 +918,7 @@ def prepare( self.properties.inference_python_version = ( manifest["python"] if "python" in manifest - and not self.properties.inference_python_version + and not self.properties.inference_python_version else self.properties.inference_python_version ) except: @@ -936,8 +935,8 @@ def prepare( as_onnx=as_onnx, model_file_name=model_file_name ) if ( - not isinstance(self.model_file_name, str) - or self.model_file_name.strip() == "" + not isinstance(self.model_file_name, str) + or self.model_file_name.strip() == "" ): raise ValueError("The `model_file_name` needs to be provided.") @@ -994,8 +993,8 @@ def prepare( ) except SerializeModelNotImplementedError as e: if not utils.is_path_exists( - uri=os.path.join(self.artifact_dir, self.model_file_name), - auth=self.auth, + uri=os.path.join(self.artifact_dir, self.model_file_name), + auth=self.auth, ): self._summary_status.update_action( detail="Serialized model", @@ -1089,7 +1088,7 @@ def prepare( return self def _handle_input_data( - self, data: Any = None, auto_serialize_data: bool = True, **kwargs + self, data: Any = None, auto_serialize_data: bool = True, **kwargs ): """Handle input data and serialize it as required. @@ -1194,11 +1193,11 @@ def get_model_serializer(self): return self.model_save_serializer def verify( - self, - data: Any = None, - reload_artifacts: bool = True, - auto_serialize_data: bool = False, - **kwargs, + self, + data: Any = None, + reload_artifacts: bool = True, + auto_serialize_data: bool = False, + **kwargs, ) -> Dict[str, Any]: """Test if deployment works in local environment. 
@@ -1280,15 +1279,15 @@ def introspect(self) -> pd.DataFrame: @classmethod def from_model_artifact( - cls: Type[Self], - uri: str, - model_file_name: str = None, - artifact_dir: Optional[str] = None, - auth: Optional[Dict] = None, - force_overwrite: Optional[bool] = False, - properties: Optional[ModelProperties] = None, - ignore_conda_error: Optional[bool] = False, - **kwargs: dict, + cls: Type[Self], + uri: str, + model_file_name: str = None, + artifact_dir: Optional[str] = None, + auth: Optional[Dict] = None, + force_overwrite: Optional[bool] = False, + properties: Optional[ModelProperties] = None, + ignore_conda_error: Optional[bool] = False, + **kwargs: dict, ) -> Self: """Loads model from a folder, or zip/tar archive. @@ -1327,9 +1326,9 @@ def from_model_artifact( If `model_file_name` not provided. """ if ( - cls._PREFIX is not "spark" - and artifact_dir - and ObjectStorageDetails.is_oci_path(artifact_dir) + cls._PREFIX is not "spark" + and artifact_dir + and ObjectStorageDetails.is_oci_path(artifact_dir) ): raise ValueError( f"Unsupported value of `artifact_dir`: {artifact_dir}. " @@ -1380,17 +1379,17 @@ def from_model_artifact( @classmethod def from_model_catalog( - cls: Type[Self], - model_id: str, - model_file_name: str = None, - artifact_dir: Optional[str] = None, - auth: Optional[Dict] = None, - force_overwrite: Optional[bool] = False, - properties: Optional[Union[ModelProperties, Dict]] = None, - bucket_uri: Optional[str] = None, - remove_existing_artifact: Optional[bool] = True, - ignore_conda_error: Optional[bool] = False, - **kwargs, + cls: Type[Self], + model_id: str, + model_file_name: str = None, + artifact_dir: Optional[str] = None, + auth: Optional[Dict] = None, + force_overwrite: Optional[bool] = False, + properties: Optional[Union[ModelProperties, Dict]] = None, + bucket_uri: Optional[str] = None, + remove_existing_artifact: Optional[bool] = True, + ignore_conda_error: Optional[bool] = False, + **kwargs, ) -> Self: """Loads model from model catalog. @@ -1693,17 +1692,17 @@ def update_deployment( @classmethod def from_id( - cls: Type[Self], - ocid: str, - model_file_name: str = None, - artifact_dir: Optional[str] = None, - auth: Optional[Dict] = None, - force_overwrite: Optional[bool] = False, - properties: Optional[Union[ModelProperties, Dict]] = None, - bucket_uri: Optional[str] = None, - remove_existing_artifact: Optional[bool] = True, - ignore_conda_error: Optional[bool] = False, - **kwargs, + cls: Type[Self], + ocid: str, + model_file_name: str = None, + artifact_dir: Optional[str] = None, + auth: Optional[Dict] = None, + force_overwrite: Optional[bool] = False, + properties: Optional[Union[ModelProperties, Dict]] = None, + bucket_uri: Optional[str] = None, + remove_existing_artifact: Optional[bool] = True, + ignore_conda_error: Optional[bool] = False, + **kwargs, ) -> Self: """Loads model from model OCID or model deployment OCID. @@ -1825,6 +1824,7 @@ def save( remove_existing_artifact: Optional[bool] = True, model_version_set: Optional[Union[str, ModelVersionSet]] = None, version_label: Optional[str] = None, + featurestore_dataset=None, **kwargs, ) -> str: """Saves model artifacts to the model catalog. @@ -1856,6 +1856,8 @@ def save( The model version set OCID, or model version set name, or `ModelVersionSet` instance. version_label: (str, optional). Defaults to None. The model version lebel. + featurestore_dataset: (Dataset, optional). + The feature store dataset kwargs: project_id: (str, optional). Project OCID. 
If not specified, the value will be taken either @@ -1887,7 +1889,7 @@ def save( # populates properties from args and kwargs. Empty values will be ignored. self.properties.with_dict(_extract_locals(locals())) self.properties.compartment_id = ( - self.properties.compartment_id or _COMPARTMENT_OCID + self.properties.compartment_id or _COMPARTMENT_OCID ) self.properties.project_id = self.properties.project_id or PROJECT_OCID @@ -1937,6 +1939,17 @@ def save( # variables in case of saving model in context of model version set. model_version_set_id = _extract_model_version_set_id(model_version_set) + + # TODO : optional code + if featurestore_dataset: + dataset_details = { + "feature-dataset-id": featurestore_dataset.id, + "feature-dataset-name-": featurestore_dataset.name + } + self.metadata_custom.add("featurestore.dataset", value=str(dataset_details), + category=MetadataCustomCategory.TRAINING_AND_VALIDATION_DATASETS, + description="feature store dataset", replace=True) + self.dsc_model = ( self.dsc_model.with_compartment_id(self.properties.compartment_id) .with_project_id(self.properties.project_id) @@ -1954,6 +1967,7 @@ def save( **kwargs, ) + self._summary_status.update_status( detail="Uploaded artifact to model catalog", status=ModelState.DONE.value ) @@ -1965,6 +1979,10 @@ def save( .with_infrastructure(ModelDeploymentInfrastructure()) .with_runtime(ModelDeploymentContainerRuntime()) ) + # Add the model id to the feature store dataset + if featurestore_dataset: + model_details = ModelDetails().with_items([self.model_id]) + featurestore_dataset.add_models(model_details) return self.model_id @@ -1979,21 +1997,21 @@ def _get_files(self): return get_files(self.artifact_dir, auth=self.auth) def deploy( - self, - wait_for_completion: Optional[bool] = True, - display_name: Optional[str] = None, - description: Optional[str] = None, - deployment_instance_shape: Optional[str] = None, - deployment_instance_subnet_id: Optional[str] = None, - deployment_instance_count: Optional[int] = None, - deployment_bandwidth_mbps: Optional[int] = None, - deployment_log_group_id: Optional[str] = None, - deployment_access_log_id: Optional[str] = None, - deployment_predict_log_id: Optional[str] = None, - deployment_memory_in_gbs: Optional[float] = None, - deployment_ocpus: Optional[float] = None, - deployment_image: Optional[str] = None, - **kwargs: Dict, + self, + wait_for_completion: Optional[bool] = True, + display_name: Optional[str] = None, + description: Optional[str] = None, + deployment_instance_shape: Optional[str] = None, + deployment_instance_subnet_id: Optional[str] = None, + deployment_instance_count: Optional[int] = None, + deployment_bandwidth_mbps: Optional[int] = None, + deployment_log_group_id: Optional[str] = None, + deployment_access_log_id: Optional[str] = None, + deployment_predict_log_id: Optional[str] = None, + deployment_memory_in_gbs: Optional[float] = None, + deployment_ocpus: Optional[float] = None, + deployment_image: Optional[str] = None, + **kwargs: Dict, ) -> "ModelDeployment": """ Deploys a model. The model needs to be saved to the model catalog at first. 
You can deploy the model @@ -2111,17 +2129,17 @@ def deploy( poll_interval = kwargs.pop("poll_interval", DEFAULT_POLL_INTERVAL) self.properties.compartment_id = ( - self.properties.compartment_id or _COMPARTMENT_OCID + self.properties.compartment_id or _COMPARTMENT_OCID ) self.properties.project_id = self.properties.project_id or PROJECT_OCID self.properties.deployment_instance_shape = ( - self.properties.deployment_instance_shape or MODEL_DEPLOYMENT_INSTANCE_SHAPE + self.properties.deployment_instance_shape or MODEL_DEPLOYMENT_INSTANCE_SHAPE ) self.properties.deployment_instance_count = ( - self.properties.deployment_instance_count or MODEL_DEPLOYMENT_INSTANCE_COUNT + self.properties.deployment_instance_count or MODEL_DEPLOYMENT_INSTANCE_COUNT ) self.properties.deployment_bandwidth_mbps = ( - self.properties.deployment_bandwidth_mbps or MODEL_DEPLOYMENT_BANDWIDTH_MBPS + self.properties.deployment_bandwidth_mbps or MODEL_DEPLOYMENT_BANDWIDTH_MBPS ) if not self.model_id: @@ -2131,8 +2149,8 @@ def deploy( ) if ( - self.properties.deployment_access_log_id - or self.properties.deployment_predict_log_id + self.properties.deployment_access_log_id + or self.properties.deployment_predict_log_id ) and not self.properties.deployment_log_group_id: raise ValueError( "`deployment_log_group_id` needs to be specified. " @@ -2144,11 +2162,11 @@ def deploy( existing_runtime = self.model_deployment.runtime web_concurrency = ( - kwargs.pop("web_concurrency", None) - or existing_infrastructure.web_concurrency + kwargs.pop("web_concurrency", None) + or existing_infrastructure.web_concurrency ) if not ( - self.properties.compartment_id or existing_infrastructure.compartment_id + self.properties.compartment_id or existing_infrastructure.compartment_id ): raise ValueError("`compartment_id` has to be provided.") if not (self.properties.project_id or existing_infrastructure.project_id): From 2e620e2145b621192ea60253a09029093fd9259f Mon Sep 17 00:00:00 2001 From: najiyacl Date: Fri, 14 Jul 2023 15:45:47 +0530 Subject: [PATCH 02/15] search models and update --- ads/feature_store/common/utils/utility.py | 71 ++++-- ads/feature_store/dataset.py | 16 +- ads/feature_store/mixin/oci_feature_store.py | 5 +- ads/model/generic_model.py | 247 ++++++++++--------- tests/integration/feature_store/test_base.py | 27 +- 5 files changed, 211 insertions(+), 155 deletions(-) diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py index d98589de2..1fdeb9dc5 100644 --- a/ads/feature_store/common/utils/utility.py +++ b/ads/feature_store/common/utils/utility.py @@ -8,9 +8,11 @@ from typing import Union, List +import oci.regions from great_expectations.core import ExpectationSuite from ads.common.decorator.runtime_dependency import OptionalDependency +from ads.common.oci_resource import OCIResource, SEARCH_TYPE from ads.feature_store.common.utils.feature_schema_mapper import ( map_spark_type_to_feature_type, map_feature_type_to_pandas, @@ -19,7 +21,7 @@ from ads.feature_store.feature_group_expectation import Rule, Expectation from ads.feature_store.input_feature_detail import FeatureDetail from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton -import re + try: from pyspark.pandas import DataFrame except ModuleNotFoundError: @@ -47,7 +49,7 @@ def get_execution_engine_type( - data_frame: Union[DataFrame, pd.DataFrame] + data_frame: Union[DataFrame, pd.DataFrame] ) -> ExecutionEngine: """ Determines the execution engine type for a given DataFrame. 
@@ -87,7 +89,7 @@ def get_metastore_id(feature_store_id: str): def validate_delta_format_parameters( - timestamp: datetime = None, version_number: int = None, is_restore: bool = False + timestamp: datetime = None, version_number: int = None, is_restore: bool = False ): """ Validate the user input provided as part of preview, restore APIs for ingested data, Ingested data is @@ -121,9 +123,9 @@ def validate_delta_format_parameters( def show_ingestion_summary( - entity_id: str, - entity_type: EntityType = EntityType.FEATURE_GROUP, - error_details: str = None, + entity_id: str, + entity_type: EntityType = EntityType.FEATURE_GROUP, + error_details: str = None, ): """ Displays a ingestion summary table with the given entity type and error details. @@ -163,7 +165,7 @@ def show_validation_summary(ingestion_status: str, validation_output, expectatio statistics = validation_output["statistics"] table_headers = ( - ["expectation_type"] + list(statistics.keys()) + ["ingestion_status"] + ["expectation_type"] + list(statistics.keys()) + ["ingestion_status"] ) table_values = [expectation_type] + list(statistics.values()) + [ingestion_status] @@ -207,9 +209,9 @@ def show_validation_summary(ingestion_status: str, validation_output, expectatio def get_features( - output_columns: List[dict], - parent_id: str, - entity_type: EntityType = EntityType.FEATURE_GROUP, + output_columns: List[dict], + parent_id: str, + entity_type: EntityType = EntityType.FEATURE_GROUP, ) -> List[Feature]: """ Returns a list of features, given a list of output_columns and a feature_group_id. @@ -266,7 +268,7 @@ def get_schema_from_spark_df(df: DataFrame): def get_schema_from_df( - data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str + data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str ) -> List[dict]: """ Given a DataFrame, returns a list of dictionaries that describe its schema. @@ -280,7 +282,7 @@ def get_schema_from_df( def get_input_features_from_df( - data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str + data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str ) -> List[FeatureDetail]: """ Given a DataFrame, returns a list of FeatureDetail objects that represent its input features. @@ -297,7 +299,7 @@ def get_input_features_from_df( def convert_expectation_suite_to_expectation( - expectation_suite: ExpectationSuite, expectation_type: ExpectationType + expectation_suite: ExpectationSuite, expectation_type: ExpectationType ): """ Convert an ExpectationSuite object to an Expectation object with detailed rule information. 
@@ -356,7 +358,7 @@ def largest_matching_subset_of_primary_keys(left_feature_group, right_feature_gr def convert_pandas_datatype_with_schema( - raw_feature_details: List[dict], input_df: pd.DataFrame + raw_feature_details: List[dict], input_df: pd.DataFrame ) -> pd.DataFrame: feature_detail_map = {} columns_to_remove = [] @@ -381,7 +383,7 @@ def convert_pandas_datatype_with_schema( def convert_spark_dataframe_with_schema( - raw_feature_details: List[dict], input_df: DataFrame + raw_feature_details: List[dict], input_df: DataFrame ) -> DataFrame: feature_detail_map = {} columns_to_remove = [] @@ -403,10 +405,35 @@ def validate_input_feature_details(input_feature_details, data_frame): return convert_spark_dataframe_with_schema(input_feature_details, data_frame) -def validate_model_ocid(model_ocid): - pattern = r'^ocid1\.datasciencemodel\.oc(?P[0-17]+)\.(?P[A-Za-z0-9]+)?\.?(?P[A-Za-z0-9]+)?\.(?P[A-Za-z0-9]+)$' - match = re.match(pattern, model_ocid) - if match: - # groups = match.groupdict() - return True - return False +def validate_model_ocid_format(model_ocid): + split_words = model_ocid.split('.') + region = split_words[3] + realm = split_words[2] + print(split_words) + # region = auth.get("signer").region will not work for config + # TODO: try to get current region if possible?? + if region in oci.regions.REGIONS_SHORT_NAMES: + region = oci.regions.REGIONS_SHORT_NAMES[region] + elif region not in oci.regions.REGIONS: + return False + if realm not in oci.regions.REGION_REALMS[region]: + return False + return True + + +def search_model_ocids(model_ids: list) -> list: + query = "query datasciencemodel resources where " + items = model_ids + for item in items: + query = query + f"identifier='{item}'||" + list_models = OCIResource.search( + query[:-2] + , type=SEARCH_TYPE.STRUCTURED, + ) + list_models_ids = [] + for model in list_models: + list_models_ids.append(model.identifier) + for model_id in model_ids: + if model_id not in list_models_ids: + logger.warning(model_id + " doesnt exist") + return list_models_ids diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py index 0bf0aba18..18e66c2ff 100644 --- a/ads/feature_store/dataset.py +++ b/ads/feature_store/dataset.py @@ -19,6 +19,7 @@ get_metastore_id, validate_delta_format_parameters, convert_expectation_suite_to_expectation, + validate_model_ocid_format, search_model_ocids ) from ads.feature_store.dataset_job import DatasetJob, IngestionMode from ads.feature_store.execution_strategy.engine.spark_engine import SparkEngine @@ -481,12 +482,13 @@ def model_details(self) -> "ModelDetails": def model_details(self, model_details: ModelDetails): self.with_model_details(model_details) - def with_model_details(self, model_details: ModelDetails) -> "Dataset": + def with_model_details(self, model_details: ModelDetails,validate: bool = True) -> "Dataset": """Sets the model details for the dataset. 
Parameters ---------- model_details: ModelDetails + validate: if to validate the model ids Returns ------- @@ -498,6 +500,18 @@ def with_model_details(self, model_details: ModelDetails) -> "Dataset": "The argument `model_details` has to be of type `ModelDetails`" "but is of type: `{}`".format(type(model_details)) ) + #TODO: we could eiher use validate_model_ocid_format if thats enough or search_model_ocids + #search_model_ocids should be used after set_auth without url + # items = model_details["items"] + # for item in items: + # if not validate_model_ocid(item): + # raise ValueError( + # f"the model ocid {item} is not valid" + # ) + if validate: + model_ids_list = search_model_ocids(model_details.items) + model_details = ModelDetails(model_ids_list) + return self.set_spec(self.CONST_MODEL_DETAILS, model_details.to_dict()) def add_models(self, model_details: ModelDetails) -> "Dataset": diff --git a/ads/feature_store/mixin/oci_feature_store.py b/ads/feature_store/mixin/oci_feature_store.py index 09a2ff108..0914de19a 100644 --- a/ads/feature_store/mixin/oci_feature_store.py +++ b/ads/feature_store/mixin/oci_feature_store.py @@ -15,9 +15,10 @@ def init_client( cls, **kwargs ) -> oci.feature_store.feature_store_client.FeatureStoreClient: + # TODO: Getting the endpoint from authorizer fs_service_endpoint = os.environ.get("OCI_FS_SERVICE_ENDPOINT") - kwargs = {"service_endpoint": fs_service_endpoint} - + if fs_service_endpoint: + kwargs = {"service_endpoint": fs_service_endpoint} client = cls._init_client( client=oci.feature_store.feature_store_client.FeatureStoreClient, **kwargs diff --git a/ads/model/generic_model.py b/ads/model/generic_model.py index ae517c886..4eac9882d 100644 --- a/ads/model/generic_model.py +++ b/ads/model/generic_model.py @@ -34,6 +34,7 @@ from ads.evaluations import EvaluatorMixin from ads.feature_engineering import ADSImage from ads.feature_engineering.schema import Schema +from ads.feature_store.model_details import ModelDetails from ads.model.artifact import ModelArtifact from ads.model.common.utils import ( _extract_locals, @@ -99,6 +100,7 @@ MODEL_DEPLOYMENT_INSTANCE_COUNT = 1 MODEL_DEPLOYMENT_BANDWIDTH_MBPS = 10 + DEFAULT_MODEL_FOLDER_NAME = "model" ONNX_DATA_TRANSFORMER = "onnx_data_transformer.json" @@ -135,7 +137,7 @@ class DataScienceModelType(str, metaclass=ExtendedEnumMeta): MODEL = "datasciencemodel" -class NotActiveDeploymentError(Exception): # pragma: no cover +class NotActiveDeploymentError(Exception): # pragma: no cover def __init__(self, state: str): msg = ( "To perform a prediction the deployed model needs to be in an active state. " @@ -144,15 +146,15 @@ def __init__(self, state: str): super().__init__(msg) -class SerializeModelNotImplementedError(NotImplementedError): # pragma: no cover +class SerializeModelNotImplementedError(NotImplementedError): # pragma: no cover pass -class SerializeInputNotImplementedError(NotImplementedError): # pragma: no cover +class SerializeInputNotImplementedError(NotImplementedError): # pragma: no cover pass -class RuntimeInfoInconsistencyError(Exception): # pragma: no cover +class RuntimeInfoInconsistencyError(Exception): # pragma: no cover pass @@ -348,9 +350,9 @@ def __init__( Instance of ads.model.SERDE. Used for serialize/deserialize model input. 
""" if ( - artifact_dir - and ObjectStorageDetails.is_oci_path(artifact_dir) - and not self._PREFIX == "spark" + artifact_dir + and ObjectStorageDetails.is_oci_path(artifact_dir) + and not self._PREFIX == "spark" ): raise ValueError( f"Unsupported value of `artifact_dir`: {artifact_dir}. " @@ -415,9 +417,9 @@ def __init__( self.ignore_conda_error = False def _init_serde( - self, - model_input_serde: Union[SERDE, str] = None, - model_save_serializer: Union[SERDE, str] = None, + self, + model_input_serde: Union[SERDE, str] = None, + model_save_serializer: Union[SERDE, str] = None, ): """Initializes serde. @@ -512,8 +514,8 @@ def _to_yaml(self): return yaml.safe_dump(self._to_dict()) def set_model_input_serializer( - self, - model_input_serializer: Union[str, SERDE], + self, + model_input_serializer: Union[str, SERDE], ): """Registers serializer used for serializing data passed in verify/predict. @@ -645,12 +647,12 @@ def set_model_save_serializer(self, model_save_serializer: Union[str, SERDE]): ) def serialize_model( - self, - as_onnx: bool = False, - initial_types: List[Tuple] = None, - force_overwrite: bool = False, - X_sample: any = None, - **kwargs, + self, + as_onnx: bool = False, + initial_types: List[Tuple] = None, + force_overwrite: bool = False, + X_sample: any = None, + **kwargs, ): """ Serialize and save model using ONNX or model specific method. @@ -688,11 +690,11 @@ def serialize_model( ) def _serialize_model_helper( - self, - initial_types: List[Tuple] = None, - force_overwrite: bool = False, - X_sample: any = None, - **kwargs, + self, + initial_types: List[Tuple] = None, + force_overwrite: bool = False, + X_sample: any = None, + **kwargs, ): model_path = self._check_model_file( self.model_file_name, force_overwrite=force_overwrite @@ -752,17 +754,17 @@ def _set_model_save_serializer_to_onnx(self): raise e def _onnx_data_transformer( - self, - X: Union[pd.DataFrame, pd.Series], - impute_values: Dict = None, - force_overwrite: bool = False, + self, + X: Union[pd.DataFrame, pd.Series], + impute_values: Dict = None, + force_overwrite: bool = False, ): """Apply onnx data transformer to data.""" if self.framework in FRAMEWORKS_WITHOUT_ONNX_DATA_TRANSFORM or X is None: return X try: if hasattr(self, "onnx_data_preprocessor") and isinstance( - self.onnx_data_preprocessor, ONNXTransformer + self.onnx_data_preprocessor, ONNXTransformer ): X = self.onnx_data_preprocessor.transform(X=X) @@ -771,8 +773,8 @@ def _onnx_data_transformer( X=X, impute_values=impute_values ) if ( - os.path.exists(os.path.join(self.artifact_dir, ONNX_DATA_TRANSFORMER)) - and not force_overwrite + os.path.exists(os.path.join(self.artifact_dir, ONNX_DATA_TRANSFORMER)) + and not force_overwrite ): raise ValueError( f"{ONNX_DATA_TRANSFORMER} already exists. 
" @@ -794,26 +796,26 @@ def _onnx_data_transformer( return X def prepare( - self, - inference_conda_env: str = None, - inference_python_version: str = None, - training_conda_env: str = None, - training_python_version: str = None, - model_file_name: str = None, - as_onnx: bool = False, - initial_types: List[Tuple] = None, - force_overwrite: bool = False, - namespace: str = CONDA_BUCKET_NS, - use_case_type: str = None, - X_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, - y_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, - training_script_path: str = None, - training_id: str = _TRAINING_RESOURCE_ID, - ignore_pending_changes: bool = True, - max_col_num: int = DATA_SCHEMA_MAX_COL_NUM, - ignore_conda_error: bool = False, - score_py_uri: str = None, - **kwargs: Dict, + self, + inference_conda_env: str = None, + inference_python_version: str = None, + training_conda_env: str = None, + training_python_version: str = None, + model_file_name: str = None, + as_onnx: bool = False, + initial_types: List[Tuple] = None, + force_overwrite: bool = False, + namespace: str = CONDA_BUCKET_NS, + use_case_type: str = None, + X_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, + y_sample: Union[list, tuple, pd.DataFrame, pd.Series, np.ndarray] = None, + training_script_path: str = None, + training_id: str = _TRAINING_RESOURCE_ID, + ignore_pending_changes: bool = True, + max_col_num: int = DATA_SCHEMA_MAX_COL_NUM, + ignore_conda_error: bool = False, + score_py_uri: str = None, + **kwargs: Dict, ) -> "GenericModel": """Prepare and save the score.py, serialized model and runtime.yaml file. @@ -918,7 +920,7 @@ def prepare( self.properties.inference_python_version = ( manifest["python"] if "python" in manifest - and not self.properties.inference_python_version + and not self.properties.inference_python_version else self.properties.inference_python_version ) except: @@ -935,8 +937,8 @@ def prepare( as_onnx=as_onnx, model_file_name=model_file_name ) if ( - not isinstance(self.model_file_name, str) - or self.model_file_name.strip() == "" + not isinstance(self.model_file_name, str) + or self.model_file_name.strip() == "" ): raise ValueError("The `model_file_name` needs to be provided.") @@ -993,8 +995,8 @@ def prepare( ) except SerializeModelNotImplementedError as e: if not utils.is_path_exists( - uri=os.path.join(self.artifact_dir, self.model_file_name), - auth=self.auth, + uri=os.path.join(self.artifact_dir, self.model_file_name), + auth=self.auth, ): self._summary_status.update_action( detail="Serialized model", @@ -1088,7 +1090,7 @@ def prepare( return self def _handle_input_data( - self, data: Any = None, auto_serialize_data: bool = True, **kwargs + self, data: Any = None, auto_serialize_data: bool = True, **kwargs ): """Handle input data and serialize it as required. @@ -1193,11 +1195,11 @@ def get_model_serializer(self): return self.model_save_serializer def verify( - self, - data: Any = None, - reload_artifacts: bool = True, - auto_serialize_data: bool = False, - **kwargs, + self, + data: Any = None, + reload_artifacts: bool = True, + auto_serialize_data: bool = False, + **kwargs, ) -> Dict[str, Any]: """Test if deployment works in local environment. 
@@ -1279,15 +1281,15 @@ def introspect(self) -> pd.DataFrame: @classmethod def from_model_artifact( - cls: Type[Self], - uri: str, - model_file_name: str = None, - artifact_dir: Optional[str] = None, - auth: Optional[Dict] = None, - force_overwrite: Optional[bool] = False, - properties: Optional[ModelProperties] = None, - ignore_conda_error: Optional[bool] = False, - **kwargs: dict, + cls: Type[Self], + uri: str, + model_file_name: str = None, + artifact_dir: Optional[str] = None, + auth: Optional[Dict] = None, + force_overwrite: Optional[bool] = False, + properties: Optional[ModelProperties] = None, + ignore_conda_error: Optional[bool] = False, + **kwargs: dict, ) -> Self: """Loads model from a folder, or zip/tar archive. @@ -1326,9 +1328,9 @@ def from_model_artifact( If `model_file_name` not provided. """ if ( - cls._PREFIX is not "spark" - and artifact_dir - and ObjectStorageDetails.is_oci_path(artifact_dir) + cls._PREFIX is not "spark" + and artifact_dir + and ObjectStorageDetails.is_oci_path(artifact_dir) ): raise ValueError( f"Unsupported value of `artifact_dir`: {artifact_dir}. " @@ -1379,17 +1381,17 @@ def from_model_artifact( @classmethod def from_model_catalog( - cls: Type[Self], - model_id: str, - model_file_name: str = None, - artifact_dir: Optional[str] = None, - auth: Optional[Dict] = None, - force_overwrite: Optional[bool] = False, - properties: Optional[Union[ModelProperties, Dict]] = None, - bucket_uri: Optional[str] = None, - remove_existing_artifact: Optional[bool] = True, - ignore_conda_error: Optional[bool] = False, - **kwargs, + cls: Type[Self], + model_id: str, + model_file_name: str = None, + artifact_dir: Optional[str] = None, + auth: Optional[Dict] = None, + force_overwrite: Optional[bool] = False, + properties: Optional[Union[ModelProperties, Dict]] = None, + bucket_uri: Optional[str] = None, + remove_existing_artifact: Optional[bool] = True, + ignore_conda_error: Optional[bool] = False, + **kwargs, ) -> Self: """Loads model from model catalog. @@ -1692,17 +1694,17 @@ def update_deployment( @classmethod def from_id( - cls: Type[Self], - ocid: str, - model_file_name: str = None, - artifact_dir: Optional[str] = None, - auth: Optional[Dict] = None, - force_overwrite: Optional[bool] = False, - properties: Optional[Union[ModelProperties, Dict]] = None, - bucket_uri: Optional[str] = None, - remove_existing_artifact: Optional[bool] = True, - ignore_conda_error: Optional[bool] = False, - **kwargs, + cls: Type[Self], + ocid: str, + model_file_name: str = None, + artifact_dir: Optional[str] = None, + auth: Optional[Dict] = None, + force_overwrite: Optional[bool] = False, + properties: Optional[Union[ModelProperties, Dict]] = None, + bucket_uri: Optional[str] = None, + remove_existing_artifact: Optional[bool] = True, + ignore_conda_error: Optional[bool] = False, + **kwargs, ) -> Self: """Loads model from model OCID or model deployment OCID. @@ -1889,7 +1891,7 @@ def save( # populates properties from args and kwargs. Empty values will be ignored. self.properties.with_dict(_extract_locals(locals())) self.properties.compartment_id = ( - self.properties.compartment_id or _COMPARTMENT_OCID + self.properties.compartment_id or _COMPARTMENT_OCID ) self.properties.project_id = self.properties.project_id or PROJECT_OCID @@ -1939,7 +1941,6 @@ def save( # variables in case of saving model in context of model version set. 
model_version_set_id = _extract_model_version_set_id(model_version_set) - # TODO : optional code if featurestore_dataset: dataset_details = { @@ -1967,7 +1968,6 @@ def save( **kwargs, ) - self._summary_status.update_status( detail="Uploaded artifact to model catalog", status=ModelState.DONE.value ) @@ -1982,7 +1982,8 @@ def save( # Add the model id to the feature store dataset if featurestore_dataset: model_details = ModelDetails().with_items([self.model_id]) - featurestore_dataset.add_models(model_details) + featurestore_dataset.with_model_details(model_details, False) + featurestore_dataset.update() return self.model_id @@ -1997,21 +1998,21 @@ def _get_files(self): return get_files(self.artifact_dir, auth=self.auth) def deploy( - self, - wait_for_completion: Optional[bool] = True, - display_name: Optional[str] = None, - description: Optional[str] = None, - deployment_instance_shape: Optional[str] = None, - deployment_instance_subnet_id: Optional[str] = None, - deployment_instance_count: Optional[int] = None, - deployment_bandwidth_mbps: Optional[int] = None, - deployment_log_group_id: Optional[str] = None, - deployment_access_log_id: Optional[str] = None, - deployment_predict_log_id: Optional[str] = None, - deployment_memory_in_gbs: Optional[float] = None, - deployment_ocpus: Optional[float] = None, - deployment_image: Optional[str] = None, - **kwargs: Dict, + self, + wait_for_completion: Optional[bool] = True, + display_name: Optional[str] = None, + description: Optional[str] = None, + deployment_instance_shape: Optional[str] = None, + deployment_instance_subnet_id: Optional[str] = None, + deployment_instance_count: Optional[int] = None, + deployment_bandwidth_mbps: Optional[int] = None, + deployment_log_group_id: Optional[str] = None, + deployment_access_log_id: Optional[str] = None, + deployment_predict_log_id: Optional[str] = None, + deployment_memory_in_gbs: Optional[float] = None, + deployment_ocpus: Optional[float] = None, + deployment_image: Optional[str] = None, + **kwargs: Dict, ) -> "ModelDeployment": """ Deploys a model. The model needs to be saved to the model catalog at first. You can deploy the model @@ -2129,17 +2130,17 @@ def deploy( poll_interval = kwargs.pop("poll_interval", DEFAULT_POLL_INTERVAL) self.properties.compartment_id = ( - self.properties.compartment_id or _COMPARTMENT_OCID + self.properties.compartment_id or _COMPARTMENT_OCID ) self.properties.project_id = self.properties.project_id or PROJECT_OCID self.properties.deployment_instance_shape = ( - self.properties.deployment_instance_shape or MODEL_DEPLOYMENT_INSTANCE_SHAPE + self.properties.deployment_instance_shape or MODEL_DEPLOYMENT_INSTANCE_SHAPE ) self.properties.deployment_instance_count = ( - self.properties.deployment_instance_count or MODEL_DEPLOYMENT_INSTANCE_COUNT + self.properties.deployment_instance_count or MODEL_DEPLOYMENT_INSTANCE_COUNT ) self.properties.deployment_bandwidth_mbps = ( - self.properties.deployment_bandwidth_mbps or MODEL_DEPLOYMENT_BANDWIDTH_MBPS + self.properties.deployment_bandwidth_mbps or MODEL_DEPLOYMENT_BANDWIDTH_MBPS ) if not self.model_id: @@ -2149,8 +2150,8 @@ def deploy( ) if ( - self.properties.deployment_access_log_id - or self.properties.deployment_predict_log_id + self.properties.deployment_access_log_id + or self.properties.deployment_predict_log_id ) and not self.properties.deployment_log_group_id: raise ValueError( "`deployment_log_group_id` needs to be specified. 
" @@ -2162,11 +2163,11 @@ def deploy( existing_runtime = self.model_deployment.runtime web_concurrency = ( - kwargs.pop("web_concurrency", None) - or existing_infrastructure.web_concurrency + kwargs.pop("web_concurrency", None) + or existing_infrastructure.web_concurrency ) if not ( - self.properties.compartment_id or existing_infrastructure.compartment_id + self.properties.compartment_id or existing_infrastructure.compartment_id ): raise ValueError("`compartment_id` has to be provided.") if not (self.properties.project_id or existing_infrastructure.project_id): diff --git a/tests/integration/feature_store/test_base.py b/tests/integration/feature_store/test_base.py index fb6d81efa..d141bfc41 100644 --- a/tests/integration/feature_store/test_base.py +++ b/tests/integration/feature_store/test_base.py @@ -19,11 +19,21 @@ from ads.feature_store.statistics_config import StatisticsConfig -client_kwargs = dict( - retry_strategy=oci.retry.NoneRetryStrategy, - service_endpoint=os.getenv("service_endpoint"), -) -ads.set_auth(client_kwargs=client_kwargs) +# client_kwargs = dict( +# retry_strategy=oci.retry.NoneRetryStrategy, +# service_endpoint=os.getenv("service_endpoint"), +# ) +# ads.set_auth(client_kwargs=client_kwargs) + +# validate_model_ocid("ocid1.datasciencemodel.oc1.iad.amaaaaaaqc2qulqa7ribuimhibly3a3l5v5eem7akg7xeycutckdvusok2la") +os.environ["OCI_FS_SERVICE_ENDPOINT"] = "http://localhost:21000/20230101" + +# ads.set_auth(auth="api_key", client_kwargs={"service_endpoint": "http://localhost:21000/20230101"}) +# Set the environment variable +os.environ['DEVELOPER_MODE'] = "True" +os.environ['JAVA_HOME'] = "/Library/Java/JavaVirtualMachines/jdk-11.0.14.jdk/Contents/Home" +os.environ['DEVELOPER_MODE'] = "True" + try: from ads.feature_store.feature_store import FeatureStore @@ -38,8 +48,11 @@ class FeatureStoreTestCase: # networks compartment in feature store TIME_NOW = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S") TENANCY_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" - COMPARTMENT_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" - METASTORE_ID = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaabiudgxyap7tizm4gscwz7amu7dixz7ml3mtesqzzwwg3urvvdgua" + # COMPARTMENT_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" + # METASTORE_ID = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaabiudgxyap7tizm4gscwz7amu7dixz7ml3mtesqzzwwg3urvvdgua" + COMPARTMENT_ID = "ocid1.tenancy.oc1..aaaaaaaap4gumyssud5udgkbyb2nv5y3vzrkgsnskeqc77dopagutu25sjhq" + METASTORE_ID = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaaqc2qulqav5pzijun724nglvsent3634hqrc2ybu5vfi3fu35tkyq" + INPUT_FEATURE_DETAILS = [ FeatureDetail("flower") .with_feature_type(FeatureType.STRING) From 44eaf5ede46af45fbe06f39253de440420967c3d Mon Sep 17 00:00:00 2001 From: najiyacl Date: Fri, 14 Jul 2023 15:53:54 +0530 Subject: [PATCH 03/15] reverting local changes --- tests/integration/feature_store/test_base.py | 27 +++++--------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/tests/integration/feature_store/test_base.py b/tests/integration/feature_store/test_base.py index d141bfc41..fb6d81efa 100644 --- a/tests/integration/feature_store/test_base.py +++ b/tests/integration/feature_store/test_base.py @@ -19,21 +19,11 @@ from ads.feature_store.statistics_config import StatisticsConfig -# client_kwargs = dict( -# retry_strategy=oci.retry.NoneRetryStrategy, -# service_endpoint=os.getenv("service_endpoint"), -# ) -# 
ads.set_auth(client_kwargs=client_kwargs) - -# validate_model_ocid("ocid1.datasciencemodel.oc1.iad.amaaaaaaqc2qulqa7ribuimhibly3a3l5v5eem7akg7xeycutckdvusok2la") -os.environ["OCI_FS_SERVICE_ENDPOINT"] = "http://localhost:21000/20230101" - -# ads.set_auth(auth="api_key", client_kwargs={"service_endpoint": "http://localhost:21000/20230101"}) -# Set the environment variable -os.environ['DEVELOPER_MODE'] = "True" -os.environ['JAVA_HOME'] = "/Library/Java/JavaVirtualMachines/jdk-11.0.14.jdk/Contents/Home" -os.environ['DEVELOPER_MODE'] = "True" - +client_kwargs = dict( + retry_strategy=oci.retry.NoneRetryStrategy, + service_endpoint=os.getenv("service_endpoint"), +) +ads.set_auth(client_kwargs=client_kwargs) try: from ads.feature_store.feature_store import FeatureStore @@ -48,11 +38,8 @@ class FeatureStoreTestCase: # networks compartment in feature store TIME_NOW = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S") TENANCY_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" - # COMPARTMENT_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" - # METASTORE_ID = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaabiudgxyap7tizm4gscwz7amu7dixz7ml3mtesqzzwwg3urvvdgua" - COMPARTMENT_ID = "ocid1.tenancy.oc1..aaaaaaaap4gumyssud5udgkbyb2nv5y3vzrkgsnskeqc77dopagutu25sjhq" - METASTORE_ID = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaaqc2qulqav5pzijun724nglvsent3634hqrc2ybu5vfi3fu35tkyq" - + COMPARTMENT_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" + METASTORE_ID = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaabiudgxyap7tizm4gscwz7amu7dixz7ml3mtesqzzwwg3urvvdgua" INPUT_FEATURE_DETAILS = [ FeatureDetail("flower") .with_feature_type(FeatureType.STRING) From 57ce096a99324cc3642de2c2abf24df24df7d787 Mon Sep 17 00:00:00 2001 From: najiyacl Date: Fri, 21 Jul 2023 08:59:36 +0530 Subject: [PATCH 04/15] remove the search model ids --- ads/feature_store/common/utils/utility.py | 36 +---------------------- ads/feature_store/dataset.py | 29 ++++++++---------- 2 files changed, 13 insertions(+), 52 deletions(-) diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py index 1fdeb9dc5..fac3b9708 100644 --- a/ads/feature_store/common/utils/utility.py +++ b/ads/feature_store/common/utils/utility.py @@ -402,38 +402,4 @@ def convert_spark_dataframe_with_schema( def validate_input_feature_details(input_feature_details, data_frame): if isinstance(data_frame, pd.DataFrame): return convert_pandas_datatype_with_schema(input_feature_details, data_frame) - return convert_spark_dataframe_with_schema(input_feature_details, data_frame) - - -def validate_model_ocid_format(model_ocid): - split_words = model_ocid.split('.') - region = split_words[3] - realm = split_words[2] - print(split_words) - # region = auth.get("signer").region will not work for config - # TODO: try to get current region if possible?? 
- if region in oci.regions.REGIONS_SHORT_NAMES: - region = oci.regions.REGIONS_SHORT_NAMES[region] - elif region not in oci.regions.REGIONS: - return False - if realm not in oci.regions.REGION_REALMS[region]: - return False - return True - - -def search_model_ocids(model_ids: list) -> list: - query = "query datasciencemodel resources where " - items = model_ids - for item in items: - query = query + f"identifier='{item}'||" - list_models = OCIResource.search( - query[:-2] - , type=SEARCH_TYPE.STRUCTURED, - ) - list_models_ids = [] - for model in list_models: - list_models_ids.append(model.identifier) - for model_id in model_ids: - if model_id not in list_models_ids: - logger.warning(model_id + " doesnt exist") - return list_models_ids + return convert_spark_dataframe_with_schema(input_feature_details, data_frame) \ No newline at end of file diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py index 18e66c2ff..c436b8103 100644 --- a/ads/feature_store/dataset.py +++ b/ads/feature_store/dataset.py @@ -18,8 +18,7 @@ from ads.feature_store.common.utils.utility import ( get_metastore_id, validate_delta_format_parameters, - convert_expectation_suite_to_expectation, - validate_model_ocid_format, search_model_ocids + convert_expectation_suite_to_expectation ) from ads.feature_store.dataset_job import DatasetJob, IngestionMode from ads.feature_store.execution_strategy.engine.spark_engine import SparkEngine @@ -375,7 +374,7 @@ def expectation_details(self) -> "Expectation": return self.get_spec(self.CONST_EXPECTATION_DETAILS) def with_expectation_suite( - self, expectation_suite: ExpectationSuite, expectation_type: ExpectationType + self, expectation_suite: ExpectationSuite, expectation_type: ExpectationType ) -> "Dataset": """Sets the expectation details for the feature group. @@ -446,7 +445,7 @@ def statistics_config(self, statistics_config: StatisticsConfig): self.with_statistics_config(statistics_config) def with_statistics_config( - self, statistics_config: Union[StatisticsConfig, bool] + self, statistics_config: Union[StatisticsConfig, bool] ) -> "Dataset": """Sets the statistics details for the dataset. @@ -482,7 +481,7 @@ def model_details(self) -> "ModelDetails": def model_details(self, model_details: ModelDetails): self.with_model_details(model_details) - def with_model_details(self, model_details: ModelDetails,validate: bool = True) -> "Dataset": + def with_model_details(self, model_details: ModelDetails) -> "Dataset": """Sets the model details for the dataset. 
Parameters @@ -500,17 +499,6 @@ def with_model_details(self, model_details: ModelDetails,validate: bool = True) "The argument `model_details` has to be of type `ModelDetails`" "but is of type: `{}`".format(type(model_details)) ) - #TODO: we could eiher use validate_model_ocid_format if thats enough or search_model_ocids - #search_model_ocids should be used after set_auth without url - # items = model_details["items"] - # for item in items: - # if not validate_model_ocid(item): - # raise ValueError( - # f"the model ocid {item} is not valid" - # ) - if validate: - model_ids_list = search_model_ocids(model_details.items) - model_details = ModelDetails(model_ids_list) return self.set_spec(self.CONST_MODEL_DETAILS, model_details.to_dict()) @@ -532,7 +520,14 @@ def add_models(self, model_details: ModelDetails) -> "Dataset": for item in items: model_details.items.append(item) self.with_model_details(model_details) - return self.update() + self.update() + + self._update_from_oci_dataset_model(self.oci_dataset) + updated_model_details = self.model_details + if updated_model_details and updated_model_details.items and model_details: + for model_id in model_details.items: + if model_id not in updated_model_details["items"]: + logger.warning("Model Id" + model_id + " doesnt exist or you do not have the permission") def remove_models(self, model_details: ModelDetails) -> "Dataset": """remove model details from the dataset, remove from the existing dataset model id list From 7fd601a13bc0e8354f7a97c049c4bfe499997f7e Mon Sep 17 00:00:00 2001 From: najiyacl Date: Fri, 21 Jul 2023 13:38:22 +0530 Subject: [PATCH 05/15] minor changes --- ads/feature_store/dataset.py | 2 +- ads/model/generic_model.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py index 6f108b8f6..c786d4760 100644 --- a/ads/feature_store/dataset.py +++ b/ads/feature_store/dataset.py @@ -527,7 +527,7 @@ def add_models(self, model_details: ModelDetails) -> "Dataset": if updated_model_details and updated_model_details.items and model_details: for model_id in model_details.items: if model_id not in updated_model_details["items"]: - logger.warning("Model Id" + model_id + " doesnt exist or you do not have the permission") + logger.warning(f"Either model with Id '{model_id}' doesnt exist or unable to validate") def remove_models(self, model_details: ModelDetails) -> "Dataset": """remove model details from the dataset, remove from the existing dataset model id list diff --git a/ads/model/generic_model.py b/ads/model/generic_model.py index 4eac9882d..068889525 100644 --- a/ads/model/generic_model.py +++ b/ads/model/generic_model.py @@ -1941,11 +1941,10 @@ def save( # variables in case of saving model in context of model version set. 
model_version_set_id = _extract_model_version_set_id(model_version_set) - # TODO : optional code if featurestore_dataset: dataset_details = { - "feature-dataset-id": featurestore_dataset.id, - "feature-dataset-name-": featurestore_dataset.name + "dataset-id": featurestore_dataset.id, + "dataset-name": featurestore_dataset.name } self.metadata_custom.add("featurestore.dataset", value=str(dataset_details), category=MetadataCustomCategory.TRAINING_AND_VALIDATION_DATASETS, @@ -1982,8 +1981,7 @@ def save( # Add the model id to the feature store dataset if featurestore_dataset: model_details = ModelDetails().with_items([self.model_id]) - featurestore_dataset.with_model_details(model_details, False) - featurestore_dataset.update() + featurestore_dataset.add_models(model_details) return self.model_id From 612672012518dcbfd3b13ce2683d13be38bdbf97 Mon Sep 17 00:00:00 2001 From: najiyacl Date: Mon, 24 Jul 2023 09:43:46 +0530 Subject: [PATCH 06/15] minor changes --- ads/feature_store/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py index c786d4760..00cf9eac0 100644 --- a/ads/feature_store/dataset.py +++ b/ads/feature_store/dataset.py @@ -529,6 +529,8 @@ def add_models(self, model_details: ModelDetails) -> "Dataset": if model_id not in updated_model_details["items"]: logger.warning(f"Either model with Id '{model_id}' doesnt exist or unable to validate") + return self + def remove_models(self, model_details: ModelDetails) -> "Dataset": """remove model details from the dataset, remove from the existing dataset model id list From 2c6d1f23b37574689f83f40b5fd6e1a6bc135b43 Mon Sep 17 00:00:00 2001 From: najiyacl Date: Mon, 24 Jul 2023 09:51:47 +0530 Subject: [PATCH 07/15] black formatted --- ads/feature_store/common/utils/utility.py | 30 ++++++++++---------- ads/feature_store/dataset.py | 10 ++++--- ads/feature_store/mixin/oci_feature_store.py | 1 - ads/feature_store/validation_output.py | 12 ++++---- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py index fac3b9708..1b355bb71 100644 --- a/ads/feature_store/common/utils/utility.py +++ b/ads/feature_store/common/utils/utility.py @@ -49,7 +49,7 @@ def get_execution_engine_type( - data_frame: Union[DataFrame, pd.DataFrame] + data_frame: Union[DataFrame, pd.DataFrame] ) -> ExecutionEngine: """ Determines the execution engine type for a given DataFrame. @@ -89,7 +89,7 @@ def get_metastore_id(feature_store_id: str): def validate_delta_format_parameters( - timestamp: datetime = None, version_number: int = None, is_restore: bool = False + timestamp: datetime = None, version_number: int = None, is_restore: bool = False ): """ Validate the user input provided as part of preview, restore APIs for ingested data, Ingested data is @@ -123,9 +123,9 @@ def validate_delta_format_parameters( def show_ingestion_summary( - entity_id: str, - entity_type: EntityType = EntityType.FEATURE_GROUP, - error_details: str = None, + entity_id: str, + entity_type: EntityType = EntityType.FEATURE_GROUP, + error_details: str = None, ): """ Displays a ingestion summary table with the given entity type and error details. 
@@ -165,7 +165,7 @@ def show_validation_summary(ingestion_status: str, validation_output, expectatio statistics = validation_output["statistics"] table_headers = ( - ["expectation_type"] + list(statistics.keys()) + ["ingestion_status"] + ["expectation_type"] + list(statistics.keys()) + ["ingestion_status"] ) table_values = [expectation_type] + list(statistics.values()) + [ingestion_status] @@ -209,9 +209,9 @@ def show_validation_summary(ingestion_status: str, validation_output, expectatio def get_features( - output_columns: List[dict], - parent_id: str, - entity_type: EntityType = EntityType.FEATURE_GROUP, + output_columns: List[dict], + parent_id: str, + entity_type: EntityType = EntityType.FEATURE_GROUP, ) -> List[Feature]: """ Returns a list of features, given a list of output_columns and a feature_group_id. @@ -268,7 +268,7 @@ def get_schema_from_spark_df(df: DataFrame): def get_schema_from_df( - data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str + data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str ) -> List[dict]: """ Given a DataFrame, returns a list of dictionaries that describe its schema. @@ -282,7 +282,7 @@ def get_schema_from_df( def get_input_features_from_df( - data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str + data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str ) -> List[FeatureDetail]: """ Given a DataFrame, returns a list of FeatureDetail objects that represent its input features. @@ -299,7 +299,7 @@ def get_input_features_from_df( def convert_expectation_suite_to_expectation( - expectation_suite: ExpectationSuite, expectation_type: ExpectationType + expectation_suite: ExpectationSuite, expectation_type: ExpectationType ): """ Convert an ExpectationSuite object to an Expectation object with detailed rule information. 
@@ -358,7 +358,7 @@ def largest_matching_subset_of_primary_keys(left_feature_group, right_feature_gr
 
 
 def convert_pandas_datatype_with_schema(
-        raw_feature_details: List[dict], input_df: pd.DataFrame
+    raw_feature_details: List[dict], input_df: pd.DataFrame
 ) -> pd.DataFrame:
     feature_detail_map = {}
     columns_to_remove = []
@@ -383,7 +383,7 @@ def convert_pandas_datatype_with_schema(
 
 
 def convert_spark_dataframe_with_schema(
-        raw_feature_details: List[dict], input_df: DataFrame
+    raw_feature_details: List[dict], input_df: DataFrame
 ) -> DataFrame:
     feature_detail_map = {}
     columns_to_remove = []
@@ -402,4 +402,4 @@ def validate_input_feature_details(input_feature_details, data_frame):
     if isinstance(data_frame, pd.DataFrame):
         return convert_pandas_datatype_with_schema(input_feature_details, data_frame)
-    return convert_spark_dataframe_with_schema(input_feature_details, data_frame)
\ No newline at end of file
+    return convert_spark_dataframe_with_schema(input_feature_details, data_frame)
diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py
index 00cf9eac0..67c451ab0 100644
--- a/ads/feature_store/dataset.py
+++ b/ads/feature_store/dataset.py
@@ -18,7 +18,7 @@ from ads.feature_store.common.utils.utility import (
     get_metastore_id,
     validate_delta_format_parameters,
-    convert_expectation_suite_to_expectation
+    convert_expectation_suite_to_expectation,
 )
 from ads.feature_store.dataset_job import DatasetJob, IngestionMode
 from ads.feature_store.execution_strategy.engine.spark_engine import SparkEngine
@@ -374,7 +374,7 @@ def expectation_details(self) -> "Expectation":
         return self.get_spec(self.CONST_EXPECTATION_DETAILS)
 
     def with_expectation_suite(
-            self, expectation_suite: ExpectationSuite, expectation_type: ExpectationType
+        self, expectation_suite: ExpectationSuite, expectation_type: ExpectationType
     ) -> "Dataset":
         """Sets the expectation details for the feature group.
 
@@ -445,7 +445,7 @@ def statistics_config(self, statistics_config: StatisticsConfig):
         self.with_statistics_config(statistics_config)
 
     def with_statistics_config(
-            self, statistics_config: Union[StatisticsConfig, bool]
+        self, statistics_config: Union[StatisticsConfig, bool]
     ) -> "Dataset":
         """Sets the statistics details for the dataset.
@@ -527,7 +527,9 @@ def add_models(self, model_details: ModelDetails) -> "Dataset":
         if updated_model_details and updated_model_details.items and model_details:
             for model_id in model_details.items:
                 if model_id not in updated_model_details["items"]:
-                    logger.warning(f"Either model with Id '{model_id}' doesnt exist or unable to validate")
+                    logger.warning(
+                        f"Either model with Id '{model_id}' doesnt exist or unable to validate"
+                    )
 
         return self
 
diff --git a/ads/feature_store/mixin/oci_feature_store.py b/ads/feature_store/mixin/oci_feature_store.py
index 0914de19a..813d36017 100644
--- a/ads/feature_store/mixin/oci_feature_store.py
+++ b/ads/feature_store/mixin/oci_feature_store.py
@@ -14,7 +14,6 @@ class OCIFeatureStoreMixin(OCIModelMixin):
     def init_client(
         cls, **kwargs
     ) -> oci.feature_store.feature_store_client.FeatureStoreClient:
-        # TODO: Getting the endpoint from authorizer
 
         fs_service_endpoint = os.environ.get("OCI_FS_SERVICE_ENDPOINT")
         if fs_service_endpoint:
diff --git a/ads/feature_store/validation_output.py b/ads/feature_store/validation_output.py
index 13f793646..2ceb17ef9 100644
--- a/ads/feature_store/validation_output.py
+++ b/ads/feature_store/validation_output.py
@@ -23,10 +23,10 @@ def to_pandas(self) -> pd.DataFrame:
             The validation output information as a pandas DataFrame.
         """
         if self.content:
-            validation_output_json = (
-                json.loads(self.content)
-            )
-            profile_result = pd.json_normalize(validation_output_json.get("results")).transpose()
+            validation_output_json = json.loads(self.content)
+            profile_result = pd.json_normalize(
+                validation_output_json.get("results")
+            ).transpose()
             return profile_result
 
     def to_summary(self) -> pd.DataFrame:
@@ -39,9 +39,7 @@ def to_summary(self) -> pd.DataFrame:
             The validation output summary information as a pandas DataFrame.
""" if self.content: - validation_output_json = ( - json.loads(self.content) - ) + validation_output_json = json.loads(self.content) profile_result = pd.json_normalize(validation_output_json).transpose() summary_df = profile_result.drop("results") return summary_df From 3e5a4b77e2b4a45beeca622bb6f1842dc20eee4d Mon Sep 17 00:00:00 2001 From: najiyacl Date: Mon, 24 Jul 2023 09:55:11 +0530 Subject: [PATCH 08/15] setting the service endpoint via env variable --- tests/integration/feature_store/test_base.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/integration/feature_store/test_base.py b/tests/integration/feature_store/test_base.py index 53a9d5058..429c03b15 100644 --- a/tests/integration/feature_store/test_base.py +++ b/tests/integration/feature_store/test_base.py @@ -20,11 +20,8 @@ from ads.feature_store.statistics_config import StatisticsConfig -client_kwargs = dict( - retry_strategy=oci.retry.NoneRetryStrategy, - service_endpoint=os.getenv("service_endpoint"), -) -ads.set_auth(client_kwargs=client_kwargs) +ads.set_auth() +os.environ["OCI_FS_SERVICE_ENDPOINT"] = os.getenv("service_endpoint") try: from ads.feature_store.feature_store import FeatureStore @@ -37,7 +34,9 @@ class FeatureStoreTestCase: # networks compartment in feature store - TIME_NOW = str.format("{}_{}",datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"),int(random()*1000)) + TIME_NOW = str.format( + "{}_{}", datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"), int(random() * 1000) + ) TENANCY_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" COMPARTMENT_ID = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa" METASTORE_ID = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaabiudgxyap7tizm4gscwz7amu7dixz7ml3mtesqzzwwg3urvvdgua" From 6d523ba39f19eab3347eb02dcf1a5d6ac31bb07b Mon Sep 17 00:00:00 2001 From: najiyacl Date: Mon, 24 Jul 2023 15:43:48 +0530 Subject: [PATCH 09/15] test updated --- ads/feature_store/common/utils/utility.py | 1 - ads/feature_store/dataset.py | 1 - tests/integration/feature_store/test_dataset_validations.py | 5 ++--- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py index 1b355bb71..b93ace259 100644 --- a/ads/feature_store/common/utils/utility.py +++ b/ads/feature_store/common/utils/utility.py @@ -12,7 +12,6 @@ from great_expectations.core import ExpectationSuite from ads.common.decorator.runtime_dependency import OptionalDependency -from ads.common.oci_resource import OCIResource, SEARCH_TYPE from ads.feature_store.common.utils.feature_schema_mapper import ( map_spark_type_to_feature_type, map_feature_type_to_pandas, diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py index 67c451ab0..085d78ef3 100644 --- a/ads/feature_store/dataset.py +++ b/ads/feature_store/dataset.py @@ -487,7 +487,6 @@ def with_model_details(self, model_details: ModelDetails) -> "Dataset": Parameters ---------- model_details: ModelDetails - validate: if to validate the model ids Returns ------- diff --git a/tests/integration/feature_store/test_dataset_validations.py b/tests/integration/feature_store/test_dataset_validations.py index bc5cc8217..990e38ef9 100644 --- a/tests/integration/feature_store/test_dataset_validations.py +++ b/tests/integration/feature_store/test_dataset_validations.py @@ -91,9 +91,8 @@ def test_dataset_model_details(self): assert dataset.oci_dataset.id dataset.materialise() - updated_dataset = 
-        updated_dataset.show()
-        assert updated_dataset.model_details is not None
+        updated_dataset = dataset.add_models(ModelDetails().with_items(["model_ocid_invalid"]))
+        assert len(updated_dataset.model_details.get("items")) == 0
         self.clean_up_dataset(dataset)
         self.clean_up_feature_group(fg)
         self.clean_up_entity(entity)

From 3f5290373ae4a71bec8bdb547b5e5e006f85fc98 Mon Sep 17 00:00:00 2001
From: najiyacl
Date: Mon, 24 Jul 2023 15:45:26 +0530
Subject: [PATCH 10/15] remove unused import

---
 ads/feature_store/common/utils/utility.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py
index b93ace259..e7fae76e5 100644
--- a/ads/feature_store/common/utils/utility.py
+++ b/ads/feature_store/common/utils/utility.py
@@ -8,7 +8,6 @@
 
 from typing import Union, List
 
-import oci.regions
 from great_expectations.core import ExpectationSuite
 
 from ads.common.decorator.runtime_dependency import OptionalDependency

From 102852034191609a40b9562bdd9d08134dd0d08c Mon Sep 17 00:00:00 2001
From: Kshitiz Lohia
Date: Tue, 25 Jul 2023 16:06:41 +0530
Subject: [PATCH 11/15] added notebook for schema evolution

---
 ads/feature_store/docs/source/notebook.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ads/feature_store/docs/source/notebook.rst b/ads/feature_store/docs/source/notebook.rst
index a931a1b14..5024cb9fa 100644
--- a/ads/feature_store/docs/source/notebook.rst
+++ b/ads/feature_store/docs/source/notebook.rst
@@ -24,3 +24,8 @@ Notebook Examples
     - `Big data operations with feature store `__
     - | 1. Ingestion of data using spark magic
       | 2. Querying and exploration of data using spark magic
+
+  * - `Schema enforcement and schema evolution `__
+    - `Schema enforcement and schema evolution `__
+    - | 1. Schema evolution is a feature that allows users to easily change a table's current schema to accommodate data that is changing over time.
+      | 2. Schema enforcement, also known as schema validation, is a safeguard in Delta Lake that ensures data quality by rejecting writes to a table that do not match the table's schema.
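The two notebook descriptions added above correspond to standard Delta Lake behaviour. As a minimal sketch of both behaviours (assuming a Spark session already configured with the Delta Lake extensions; the table name ``schema_demo`` and the sample frames are hypothetical and are not part of any patch in this series):

.. code-block:: python3

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("schema-demo").getOrCreate()

    # Seed a Delta table with a two-column schema.
    v1 = spark.createDataFrame([(1, "a")], ["id", "name"])
    v1.write.format("delta").mode("append").saveAsTable("schema_demo")

    # Schema enforcement: appending a frame that carries an extra column is
    # rejected by Delta Lake with an AnalysisException.
    v2 = spark.createDataFrame([(2, "b", 0.5)], ["id", "name", "score"])
    # v2.write.format("delta").mode("append").saveAsTable("schema_demo")

    # Schema evolution: opting in with mergeSchema evolves the table schema
    # instead, appending the row and adding the new "score" column.
    v2.write.format("delta").mode("append").option(
        "mergeSchema", "true"
    ).saveAsTable("schema_demo")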
From 962d0ed3a7d921e9b3ca1bca25ee21cede2a7954 Mon Sep 17 00:00:00 2001
From: najiyacl
Date: Fri, 28 Jul 2023 09:44:36 +0530
Subject: [PATCH 12/15] review comments

---
 ads/feature_store/dataset.py | 37 +++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py
index 085d78ef3..981023d04 100644
--- a/ads/feature_store/dataset.py
+++ b/ads/feature_store/dataset.py
@@ -517,20 +517,31 @@ def add_models(self, model_details: ModelDetails) -> "Dataset":
         if existing_model_details and existing_model_details.items:
             items = existing_model_details["items"]
             for item in items:
-                model_details.items.append(item)
+                if item not in model_details.items:
+                    model_details.items.append(item)
         self.with_model_details(model_details)
-        self.update()
-
-        self._update_from_oci_dataset_model(self.oci_dataset)
-        updated_model_details = self.model_details
-        if updated_model_details and updated_model_details.items and model_details:
-            for model_id in model_details.items:
-                if model_id not in updated_model_details["items"]:
-                    logger.warning(
-                        f"Either model with Id '{model_id}' doesnt exist or unable to validate"
-                    )
-
-        return self
+        try:
+            return self.update()
+        except Exception as ex:
+            logger.error(
+                f"Dataset update Failed with : {type(ex)} with error message: {ex}"
+            )
+            if existing_model_details:
+                self.with_model_details(ModelDetails().with_items(existing_model_details["items"]))
+            else:
+                self.with_model_details(ModelDetails().with_items([]))
+
+        # self._update_from_oci_dataset_model(self.oci_dataset)
+        # updated_model_details = self.model_details
+        # if updated_model_details and updated_model_details.items and model_details:
+        #     for model_id in model_details.items:
+        #         if model_id not in updated_model_details["items"]:
+        #             logger.warning(
+        #                 f"Either model with Id '{model_id}' doesnt exist or unable to validate"
+        #             )
+        # )
+        #
+        # return self

From 2f725f65de50666c28cdadabb300652f0a9aee35 Mon Sep 17 00:00:00 2001
From: najiyacl
Date: Mon, 31 Jul 2023 12:50:12 +0530
Subject: [PATCH 13/15] Doc Update

---
 ads/feature_store/dataset.py                  |  1 +
 ads/feature_store/docs/source/dataset.rst     | 13 +++++++++++++
 ads/feature_store/docs/source/quickstart.rst  |  3 ++-
 .../feature_store/test_dataset_validations.py |  4 ++--
 4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py
index 981023d04..f6c8b990f 100644
--- a/ads/feature_store/dataset.py
+++ b/ads/feature_store/dataset.py
@@ -530,6 +530,7 @@ def add_models(self, model_details: ModelDetails) -> "Dataset":
                 self.with_model_details(ModelDetails().with_items([]))
+            return self
 
diff --git a/ads/feature_store/docs/source/dataset.rst b/ads/feature_store/docs/source/dataset.rst
index 158c2e3f1..13c09c49d 100644
--- a/ads/feature_store/docs/source/dataset.rst
+++ b/ads/feature_store/docs/source/dataset.rst
@@ -317,3 +317,16 @@ Below is an example of the output.
 
 .. figure:: figures/dataset_lineage.png
   :width: 400
+
+
+Add Model Details
+=================
+
+You can call the ``add_models()`` method of the Dataset instance to add model IDs to a dataset.
+The ``.add_models()`` method takes the following parameter:
+``ModelDetails()`` with which we can associate model ids.
+
+
+.. code-block:: python3
+
+    dataset.add_models(ModelDetails().with_items([]))
diff --git a/ads/feature_store/docs/source/quickstart.rst b/ads/feature_store/docs/source/quickstart.rst
index 7baffbde4..4b3655cf7 100644
--- a/ads/feature_store/docs/source/quickstart.rst
+++ b/ads/feature_store/docs/source/quickstart.rst
@@ -55,8 +55,9 @@ Background reading to understand the concepts of Feature Store and OCI Data Scie
     compartment_id = "ocid1.compartment."
     metastore_id = "ocid1.datacatalogmetastore.oc1.iad."
     api_gateway_endpoint = "https://**.{region}.oci.customer-oci.com/20230101"
+    os.environ["OCI_FS_SERVICE_ENDPOINT"] = api_gateway_endpoint
 
-    ads.set_auth(auth="user_principal", client_kwargs={"service_endpoint": api_gateway_endpoint})
+    ads.set_auth(auth="api_key")
 
     # step1: Create feature store
     feature_store_resource = (
diff --git a/tests/integration/feature_store/test_dataset_validations.py b/tests/integration/feature_store/test_dataset_validations.py
index 990e38ef9..375a15be0 100644
--- a/tests/integration/feature_store/test_dataset_validations.py
+++ b/tests/integration/feature_store/test_dataset_validations.py
@@ -91,8 +91,8 @@ def test_dataset_model_details(self):
         assert dataset.oci_dataset.id
         dataset.materialise()
 
-        updated_dataset = dataset.add_models(ModelDetails().with_items(["model_ocid_invalid"]))
-        assert len(updated_dataset.model_details.get("items")) == 0
+        dataset.add_models(ModelDetails().with_items(["model_ocid_invalid"]))
+        assert len(dataset.model_details.get("items")) == 0
         self.clean_up_dataset(dataset)
         self.clean_up_feature_group(fg)
         self.clean_up_entity(entity)

From 1772caf5d09f927f1ebcfbb4bf50752f7efc651c Mon Sep 17 00:00:00 2001
From: najiyacl
Date: Mon, 31 Jul 2023 16:21:14 +0530
Subject: [PATCH 14/15] removing commented code

---
 ads/feature_store/dataset.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py
index f6c8b990f..870f8fa63 100644
--- a/ads/feature_store/dataset.py
+++ b/ads/feature_store/dataset.py
@@ -532,18 +532,6 @@ def add_models(self, model_details: ModelDetails) -> "Dataset":
                 self.with_model_details(ModelDetails().with_items([]))
             return self
 
-        # self._update_from_oci_dataset_model(self.oci_dataset)
-        # updated_model_details = self.model_details
-        # if updated_model_details and updated_model_details.items and model_details:
-        #     for model_id in model_details.items:
-        #         if model_id not in updated_model_details["items"]:
-        #             logger.warning(
-        #                 f"Either model with Id '{model_id}' doesnt exist or unable to validate"
-        #             )
-        # )
-        #
-        # return self
-
     def remove_models(self, model_details: ModelDetails) -> "Dataset":
         """remove model details from the dataset, remove from the existing dataset model id list

From 58bfaa9f790f5566ff387050cfed48db229cf5d1 Mon Sep 17 00:00:00 2001
From: najiyacl
Date: Mon, 31 Jul 2023 18:00:51 +0530
Subject: [PATCH 15/15] doc update - minor

---
 ads/feature_store/docs/source/dataset.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ads/feature_store/docs/source/dataset.rst b/ads/feature_store/docs/source/dataset.rst
index 13c09c49d..5275fe20a 100644
--- a/ads/feature_store/docs/source/dataset.rst
+++ b/ads/feature_store/docs/source/dataset.rst
@@ -306,7 +306,7 @@ Use the ``show()`` method on the ``Dataset`` instance to visualize the lineage o
 
 The ``show()`` method takes the following optional parameter:
 
-  - ``rankdir: (str, optional)``. Defaults to ``LR``. The allowed values are ``TB`` or ``LR``. This parameter is applicable only for ``graph`` mode and it renders the direction of the graph as either top to bottom (TB) or left to right (LR).
+- ``rankdir: (str, optional)``. Defaults to ``LR``. The allowed values are ``TB`` or ``LR``. This parameter is applicable only for ``graph`` mode and it renders the direction of the graph as either top to bottom (TB) or left to right (LR).
 
 .. code-block:: python3
@@ -324,8 +324,8 @@ Add Model Details
 
 You can call the ``add_models()`` method of the Dataset instance to add model IDs to a dataset.
 The ``.add_models()`` method takes the following parameter:
-``ModelDetails()`` with which we can associate model ids.
+- ``model_details: ModelDetails``. ``ModelDetails`` takes ``items: List[str]`` as a parameter, and the model IDs to attach are passed as items.
 
 .. code-block:: python3
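    # A hedged sketch (not part of the patch series): attach and inspect model
    # IDs on an existing, materialised `dataset`. The OCID below is a
    # placeholder, not a real model.
    model_ocid = "ocid1.datasciencemodel.oc1..<unique_id>"
    dataset.add_models(ModelDetails().with_items([model_ocid]))

    # add_models() merges the new IDs with any previously attached items and
    # persists them via update(); on a failed update the previous model
    # details are restored, so the attached items can always be inspected:
    print(dataset.model_details.get("items"))

    # remove_models() detaches IDs from the dataset's model id list again.
    dataset.remove_models(ModelDetails().with_items([model_ocid]))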