Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats representation via plotly in to_viz interface #335

Merged
merged 13 commits into from
Sep 21, 2023
21 changes: 11 additions & 10 deletions ads/feature_store/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from ads.feature_store.feature_group_expectation import Expectation
from ads.feature_store.feature_option_details import FeatureOptionDetails
from ads.feature_store.service.oci_dataset import OCIDataset
from ads.feature_store.statistics import Statistics
from ads.feature_store.statistics.statistics import Statistics
from ads.feature_store.statistics_config import StatisticsConfig
from ads.feature_store.service.oci_lineage import OCILineage
from ads.feature_store.model_details import ModelDetails
Expand Down Expand Up @@ -164,7 +164,7 @@ def __init__(self, spec: Dict = None, **kwargs) -> None:
self.oci_dataset = self._to_oci_dataset(**kwargs)
self.lineage = OCILineage(**kwargs)

def _to_oci_dataset(self, **kwargs):
def _to_oci_dataset(self, **kwargs) -> OCIDataset:
"""Creates an `OCIDataset` instance from the `Dataset`.

kwargs
Expand Down Expand Up @@ -235,8 +235,8 @@ def name(self) -> str:
return self.get_spec(self.CONST_NAME)

@name.setter
def name(self, name: str) -> "Dataset":
return self.with_name(name)
def name(self, name: str):
self.with_name(name)

def with_name(self, name: str) -> "Dataset":
"""Sets the name.
Expand Down Expand Up @@ -866,9 +866,8 @@ def _update_from_oci_dataset_model(self, oci_dataset: OCIDataset) -> "Dataset":

value = {self.CONST_ITEMS: features_list}
else:
value = getattr(self.oci_dataset, dsc_attr)
value = dataset_details[infra_attr]
self.set_spec(infra_attr, value)

return self

def materialise(
Expand Down Expand Up @@ -1206,12 +1205,14 @@ def to_dict(self) -> Dict:
for key, value in spec.items():
if hasattr(value, "to_dict"):
value = value.to_dict()
if hasattr(value, "attribute_map"):
value = self.oci_dataset.client.base_client.sanitize_for_serialization(
if key == self.CONST_FEATURE_GROUP:
spec[
key
] = self.oci_dataset.client.base_client.sanitize_for_serialization(
value
)
spec[key] = value

else:
spec[key] = value
return {
"kind": self.kind,
"type": self.type,
Expand Down
20 changes: 12 additions & 8 deletions ads/feature_store/execution_strategy/spark/spark_execution.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-
import json

# Copyright (c) 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

Expand Down Expand Up @@ -29,8 +27,6 @@
raise

from ads.feature_store.common.enums import (
FeatureStoreJobType,
LifecycleState,
EntityType,
ExpectationType,
)
Expand All @@ -47,6 +43,11 @@

from ads.feature_store.feature_statistics.statistics_service import StatisticsService
from ads.feature_store.common.utils.utility import validate_input_feature_details
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from ads.feature_store.feature_group import FeatureGroup
from ads.feature_store.dataset import Dataset

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -76,7 +77,10 @@ def __init__(self, metastore_id: str = None):
self._jvm = self._spark_context._jvm

def ingest_feature_definition(
self, feature_group, feature_group_job: FeatureGroupJob, dataframe
self,
feature_group: "FeatureGroup",
feature_group_job: FeatureGroupJob,
dataframe,
):
try:
self._save_offline_dataframe(dataframe, feature_group, feature_group_job)
Expand All @@ -90,7 +94,7 @@ def ingest_dataset(self, dataset, dataset_job: DatasetJob):
raise SparkExecutionException(e).with_traceback(e.__traceback__)

def delete_feature_definition(
self, feature_group, feature_group_job: FeatureGroupJob
self, feature_group: "FeatureGroup", feature_group_job: FeatureGroupJob
):
"""
Deletes a feature definition from the system.
Expand Down Expand Up @@ -122,7 +126,7 @@ def delete_feature_definition(
output_details=output_details,
)

def delete_dataset(self, dataset, dataset_job: DatasetJob):
def delete_dataset(self, dataset: "Dataset", dataset_job: DatasetJob):
"""
Deletes a dataset from the system.

Expand Down Expand Up @@ -154,7 +158,7 @@ def delete_dataset(self, dataset, dataset_job: DatasetJob):
)

@staticmethod
def _validate_expectation(expectation_type, validation_output):
def _validate_expectation(expectation_type, validation_output: dict):
"""
Validates the expectation based on the given expectation type and the validation output.

Expand Down
10 changes: 4 additions & 6 deletions ads/feature_store/feature_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from ads.feature_store.service.oci_feature_group import OCIFeatureGroup
from ads.feature_store.service.oci_feature_group_job import OCIFeatureGroupJob
from ads.feature_store.service.oci_lineage import OCILineage
from ads.feature_store.statistics import Statistics
from ads.feature_store.statistics.statistics import Statistics
from ads.feature_store.statistics_config import StatisticsConfig
from ads.feature_store.validation_output import ValidationOutput

Expand Down Expand Up @@ -244,8 +244,8 @@ def name(self) -> str:
return self.get_spec(self.CONST_NAME)

@name.setter
def name(self, name: str) -> "FeatureGroup":
return self.with_name(name)
def name(self, name: str):
self.with_name(name)

def with_name(self, name: str) -> "FeatureGroup":
"""Sets the name.
Expand Down Expand Up @@ -338,7 +338,7 @@ def transformation_kwargs(self, value: Dict):
self.with_transformation_kwargs(value)

def with_transformation_kwargs(
self, transformation_kwargs: Dict = {}
self, transformation_kwargs: Dict = ()
) -> "FeatureGroup":
"""Sets the primary keys of the feature group.

Expand Down Expand Up @@ -604,7 +604,6 @@ def with_statistics_config(
FeatureGroup
The FeatureGroup instance (self).
"""
statistics_config_in = None
if isinstance(statistics_config, StatisticsConfig):
statistics_config_in = statistics_config
elif isinstance(statistics_config, bool):
Expand Down Expand Up @@ -1108,7 +1107,6 @@ def restore(self, version_number: int = None, timestamp: datetime = None):
f"RESTORE TABLE {target_table} TO VERSION AS OF {version_number}"
)
else:
iso_timestamp = timestamp.isoformat(" ", "seconds").__str__()
sql_query = f"RESTORE TABLE {target_table} TO TIMESTAMP AS OF {timestamp}"

restore_output = self.spark_engine.sql(sql_query)
Expand Down
25 changes: 0 additions & 25 deletions ads/feature_store/statistics.py

This file was deleted.

Empty file.
Empty file.
59 changes: 59 additions & 0 deletions ads/feature_store/statistics/charts/abstract_feature_stat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-
# Copyright (c) 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from abc import abstractmethod
from typing import Union

from ads.common.decorator.runtime_dependency import OptionalDependency

try:
from plotly.graph_objs import Figure
except ModuleNotFoundError:
raise ModuleNotFoundError(
f"The `plotly` module was not found. Please run `pip install "
f"{OptionalDependency.FEATURE_STORE}`."
)


class AbsFeatureStat:
    """Base class for a feature statistic that can be drawn on a plotly figure.

    Constructing an instance runs ``__validate__``. Subclasses are expected to
    provide ``__validate__``, ``add_to_figure`` and ``__from_json__``.

    NOTE: this class does not use ``abc.ABCMeta``, so the ``@abstractmethod``
    markers below are documentation only; they do not prevent instantiation of
    a subclass that leaves a method unimplemented.
    """

    class ValidationFailedException(Exception):
        """Exception type for validation failures; it takes no arguments."""

        def __init__(self):
            pass

    def __init__(self):
        # Validation is the only work done at construction time here.
        self.__validate__()

    @abstractmethod
    def __validate__(self):
        """Validate the state of the statistic; the base version does nothing."""
        pass

    @abstractmethod
    def add_to_figure(self, fig: "Figure", xaxis: int, yaxis: int):
        """Add this statistic to ``fig``; the base version does nothing.

        Parameters
        ----------
        fig: Figure
            The plotly figure to draw on.
        xaxis: int
            Zero-based x axis index (see ``get_x_y_str_axes``).
        yaxis: int
            Zero-based y axis index (see ``get_x_y_str_axes``).
        """
        pass

    @classmethod
    @abstractmethod
    def __from_json__(cls, json_dict: dict):
        """Build an instance from ``json_dict``; the base version returns None."""
        pass

    @staticmethod
    def get_x_y_str_axes(xaxis: int, yaxis: int) -> tuple:
        """Return the axis names for the given zero-based axis indexes.

        Parameters
        ----------
        xaxis: int
            Zero-based x axis index.
        yaxis: int
            Zero-based y axis index.

        Returns
        -------
        tuple
            ``("xaxis<N>", "yaxis<M>", "x<N>", "y<M>")`` where ``N = xaxis + 1``
            and ``M = yaxis + 1``.
        """
        x_number = str(xaxis + 1)
        y_number = str(yaxis + 1)
        return (
            "xaxis" + x_number,
            "yaxis" + y_number,
            "x" + x_number,
            "y" + y_number,
        )

    @classmethod
    def from_json(
        cls, json_dict: dict, ignore_errors: bool = False
    ) -> Union["AbsFeatureStat", None]:
        """Build an instance through ``__from_json__``.

        Parameters
        ----------
        json_dict: dict
            The payload handed to ``__from_json__``.
        ignore_errors: bool
            When True, any ``Exception`` raised while building is swallowed
            and ``None`` is returned instead.

        Returns
        -------
        Union[AbsFeatureStat, None]
            Whatever ``__from_json__`` returns, or ``None`` when an error was
            ignored.

        Raises
        ------
        Exception
            Whatever ``__from_json__`` raised, when ``ignore_errors`` is False.
        """
        try:
            return cls.__from_json__(json_dict=json_dict)
        except Exception:
            if ignore_errors:
                return None
            # Bare raise re-raises the active exception with its traceback intact.
            raise
120 changes: 120 additions & 0 deletions ads/feature_store/statistics/charts/box_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-
# Copyright (c) 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from typing import List

from ads.common.decorator.runtime_dependency import OptionalDependency
from ads.feature_store.statistics.charts.abstract_feature_stat import AbsFeatureStat
from ads.feature_store.statistics.charts.frequency_distribution import (
FrequencyDistribution,
)
from ads.feature_store.statistics.generic_feature_value import GenericFeatureValue

try:
from plotly.graph_objs import Figure
except ModuleNotFoundError:
raise ModuleNotFoundError(
f"The `plotly` module was not found. Please run `pip install "
f"{OptionalDependency.FEATURE_STORE}`."
)


class BoxPlot(AbsFeatureStat):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you for adding box plot also 👍

CONST_MIN = "Min"
CONST_MAX = "Max"
CONST_QUARTILES = "Quartiles"
CONST_SD = "StandardDeviation"
CONST_MEAN = "Mean"
CONST_BOX_PLOT_TITLE = "Box Plot"
CONST_IQR = "IQR"
CONST_BOX_POINTS = "box_points"

class Quartiles:
    """Container for the three quartile values of a box plot.

    ``q2`` is the value ``BoxPlot`` uses as its median.
    """

    # Keys looked up in the JSON payload.
    CONST_Q1 = "q1"
    CONST_Q2 = "q2"
    CONST_Q3 = "q3"

    def __init__(self, q1: float, q2: float, q3: float):
        self.q1, self.q2, self.q3 = q1, q2, q3

    @classmethod
    def from_json(cls, json_dict: dict) -> "BoxPlot.Quartiles":
        """Create quartiles from a dict; keys that are absent yield ``None``."""
        first = json_dict.get(cls.CONST_Q1)
        second = json_dict.get(cls.CONST_Q2)
        third = json_dict.get(cls.CONST_Q3)
        return cls(first, second, third)

def __init__(
    self,
    mean: float,
    median: float,
    sd: float,
    q1: float,
    q3: float,
    box_points: List[float],
):
    """Store the box plot statistics and derive the interquartile range.

    Parameters
    ----------
    mean: float
        Mean value.
    median: float
        Median value.
    sd: float
        Standard deviation.
    q1: float
        First quartile.
    q3: float
        Third quartile.
    box_points: List[float]
        Points passed to the box trace as its ``y`` values.
    """
    self.mean, self.median = mean, median
    self.q1, self.q3 = q1, q3
    self.sd = sd
    # The IQR is computed before the base initializer (and its validation)
    # runs, so q1 and q3 must already support subtraction at this point.
    self.iqr = q3 - q1
    self.box_points = box_points
    super().__init__()

def add_to_figure(self, fig: Figure, xaxis: int, yaxis: int):
xaxis_str, yaxis_str, x_str, y_str = self.get_x_y_str_axes(xaxis, yaxis)
fig.add_box(
notched=False,
boxmean=False,
mean=[self.mean],
median=[self.median],
q1=[self.q1],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could there be scenarios where quartiles are not populated by the mlm library?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is true. For that, we have implemented validations.

q3=[self.q3],
sd=[self.sd],
y=[self.box_points],
upperfence=[self.q3 + 1.5 * self.iqr],
lowerfence=[self.q1 - 1.5 * self.iqr],
xaxis=x_str,
yaxis=y_str,
name="",
jitter=0,
)
fig.layout.annotations[xaxis].text = self.CONST_BOX_PLOT_TITLE
fig.layout[yaxis_str]["title"] = "Values"

@staticmethod
def get_box_points_from_frequency_distribution(
    frequency_distribution: FrequencyDistribution,
) -> List[float]:
    """Return the bins whose paired frequency is greater than zero.

    An empty list is returned when the distribution itself, its
    ``frequency`` or its ``bins`` is ``None``.
    """
    if frequency_distribution is None:
        return []
    if frequency_distribution.frequency is None:
        return []
    if frequency_distribution.bins is None:
        return []

    paired = zip(frequency_distribution.frequency, frequency_distribution.bins)
    return [bin_value for count, bin_value in paired if count > 0]

@classmethod
def __from_json__(cls, json_dict: dict) -> "BoxPlot":
    """Build a ``BoxPlot`` from its JSON dictionary.

    Reads the ``Quartiles``, ``Mean``, ``StandardDeviation`` and
    ``box_points`` entries; the median is taken from the second quartile.
    """
    quartiles = cls.Quartiles.from_json(json_dict.get(cls.CONST_QUARTILES))
    mean_value = GenericFeatureValue.from_json(json_dict.get(cls.CONST_MEAN)).val
    median_value = quartiles.q2
    sd_value = GenericFeatureValue.from_json(json_dict.get(cls.CONST_SD)).val
    return cls(
        mean=mean_value,
        median=median_value,
        sd=sd_value,
        q1=quartiles.q1,
        q3=quartiles.q3,
        box_points=json_dict.get(cls.CONST_BOX_POINTS),
    )
Loading