diff --git a/THIRD_PARTY_LICENSES.txt b/THIRD_PARTY_LICENSES.txt index 404fedace..d81f2bdd5 100644 --- a/THIRD_PARTY_LICENSES.txt +++ b/THIRD_PARTY_LICENSES.txt @@ -229,6 +229,12 @@ pandavro * Source code: https://github.com/ynqa/pandavro * Project home: https://github.com/ynqa/pandavro +plotly +* Copyright (c) 2016-2018 Plotly, Inc +* License: MIT License +* Source code: https://github.com/plotly/plotly.py +* Project home: https://plotly.com/ + protobuf * Copyright 2008 Google Inc. All rights reserved. * License: Google Protobuf License diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py index 4275d3e37..e505a8a76 100644 --- a/ads/feature_store/dataset.py +++ b/ads/feature_store/dataset.py @@ -38,7 +38,7 @@ from ads.feature_store.feature_group_expectation import Expectation from ads.feature_store.feature_option_details import FeatureOptionDetails from ads.feature_store.service.oci_dataset import OCIDataset -from ads.feature_store.statistics import Statistics +from ads.feature_store.statistics.statistics import Statistics from ads.feature_store.statistics_config import StatisticsConfig from ads.feature_store.service.oci_lineage import OCILineage from ads.feature_store.model_details import ModelDetails @@ -164,7 +164,7 @@ def __init__(self, spec: Dict = None, **kwargs) -> None: self.oci_dataset = self._to_oci_dataset(**kwargs) self.lineage = OCILineage(**kwargs) - def _to_oci_dataset(self, **kwargs): + def _to_oci_dataset(self, **kwargs) -> OCIDataset: """Creates an `OCIDataset` instance from the `Dataset`. kwargs @@ -235,8 +235,8 @@ def name(self) -> str: return self.get_spec(self.CONST_NAME) @name.setter - def name(self, name: str) -> "Dataset": - return self.with_name(name) + def name(self, name: str): + self.with_name(name) def with_name(self, name: str) -> "Dataset": """Sets the name. 
@@ -1207,12 +1207,14 @@ def to_dict(self) -> Dict: for key, value in spec.items(): if hasattr(value, "to_dict"): value = value.to_dict() - if hasattr(value, "attribute_map"): - value = self.oci_dataset.client.base_client.sanitize_for_serialization( + if key == self.CONST_FEATURE_GROUP: + spec[ + key + ] = self.oci_dataset.client.base_client.sanitize_for_serialization( value ) - spec[key] = value - + else: + spec[key] = value return { "kind": self.kind, "type": self.type, diff --git a/ads/feature_store/docs/Makefile b/ads/feature_store/docs/Makefile index f93e7ea53..36284073e 100644 --- a/ads/feature_store/docs/Makefile +++ b/ads/feature_store/docs/Makefile @@ -18,7 +18,7 @@ help: .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/ads/feature_store/docs/source/dataset.rst b/ads/feature_store/docs/source/dataset.rst index 42f65f4fa..0fe9700af 100644 --- a/ads/feature_store/docs/source/dataset.rst +++ b/ads/feature_store/docs/source/dataset.rst @@ -220,6 +220,14 @@ The ``get_statistics()`` method takes the following optional parameter: .. image:: figures/dataset_statistics.png +.. code-block:: python3 + + # Fetch and visualize stats for a dataset job + df = dataset.get_statistics(job_id).to_viz() + +.. image:: figures/dataset_statistics_viz.png + + .. seealso:: :ref:`Statistics` diff --git a/ads/feature_store/docs/source/feature_group.rst b/ads/feature_store/docs/source/feature_group.rst index c1d05fe99..860280336 100644 --- a/ads/feature_store/docs/source/feature_group.rst +++ b/ads/feature_store/docs/source/feature_group.rst @@ -254,6 +254,13 @@ The ``get_statistics()`` method takes the following optional parameter: .. image:: figures/stats_1.png +.. 
code-block:: python3 + + # Fetch and visualize stats for a feature group job + df = feature_group.get_statistics(job_id).to_viz() + +.. image:: figures/feature_group_statistics_viz.png + .. seealso:: :ref:`Statistics` diff --git a/ads/feature_store/docs/source/figures/dataset_statistics_viz.png b/ads/feature_store/docs/source/figures/dataset_statistics_viz.png new file mode 100644 index 000000000..3f79725e2 Binary files /dev/null and b/ads/feature_store/docs/source/figures/dataset_statistics_viz.png differ diff --git a/ads/feature_store/docs/source/figures/feature_group_statistics_viz.png b/ads/feature_store/docs/source/figures/feature_group_statistics_viz.png new file mode 100644 index 000000000..29cc74780 Binary files /dev/null and b/ads/feature_store/docs/source/figures/feature_group_statistics_viz.png differ diff --git a/ads/feature_store/feature_group.py b/ads/feature_store/feature_group.py index 888c7cafb..c11fc34a9 100644 --- a/ads/feature_store/feature_group.py +++ b/ads/feature_store/feature_group.py @@ -44,7 +44,7 @@ from ads.feature_store.service.oci_feature_group import OCIFeatureGroup from ads.feature_store.service.oci_feature_group_job import OCIFeatureGroupJob from ads.feature_store.service.oci_lineage import OCILineage -from ads.feature_store.statistics import Statistics +from ads.feature_store.statistics.statistics import Statistics from ads.feature_store.statistics_config import StatisticsConfig from ads.feature_store.validation_output import ValidationOutput @@ -244,8 +244,8 @@ def name(self) -> str: return self.get_spec(self.CONST_NAME) @name.setter - def name(self, name: str) -> "FeatureGroup": - return self.with_name(name) + def name(self, name: str): + self.with_name(name) def with_name(self, name: str) -> "FeatureGroup": """Sets the name. 
@@ -338,7 +338,7 @@ def transformation_kwargs(self, value: Dict): self.with_transformation_kwargs(value) def with_transformation_kwargs( - self, transformation_kwargs: Dict = {} + self, transformation_kwargs: Dict = () ) -> "FeatureGroup": """Sets the primary keys of the feature group. @@ -604,7 +604,6 @@ def with_statistics_config( FeatureGroup The FeatureGroup instance (self). """ - statistics_config_in = None if isinstance(statistics_config, StatisticsConfig): statistics_config_in = statistics_config elif isinstance(statistics_config, bool): @@ -1108,7 +1107,6 @@ def restore(self, version_number: int = None, timestamp: datetime = None): f"RESTORE TABLE {target_table} TO VERSION AS OF {version_number}" ) else: - iso_timestamp = timestamp.isoformat(" ", "seconds").__str__() sql_query = f"RESTORE TABLE {target_table} TO TIMESTAMP AS OF {timestamp}" restore_output = self.spark_engine.sql(sql_query) diff --git a/ads/feature_store/statistics.py b/ads/feature_store/statistics.py deleted file mode 100644 index 2ce0589c7..000000000 --- a/ads/feature_store/statistics.py +++ /dev/null @@ -1,25 +0,0 @@ -import pandas as pd -from typing import Dict -from copy import deepcopy - -from ads.feature_store.response.response_builder import ResponseBuilder -from ads.jobs.builders.base import Builder -from ads.common import utils - - -class Statistics(ResponseBuilder): - """ - Represents statistical information. - """ - - @property - def kind(self) -> str: - """ - Gets the kind of the statistics object. - - Returns - ------- - str - The kind of the statistics object, which is always "statistics". 
- """ - return "statistics" diff --git a/ads/feature_store/statistics/__init__.py b/ads/feature_store/statistics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ads/feature_store/statistics/charts/__init__.py b/ads/feature_store/statistics/charts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ads/feature_store/statistics/charts/abstract_feature_stat.py b/ads/feature_store/statistics/charts/abstract_feature_stat.py new file mode 100644 index 000000000..814929b80 --- /dev/null +++ b/ads/feature_store/statistics/charts/abstract_feature_stat.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from abc import abstractmethod +from typing import Union + +from ads.common.decorator.runtime_dependency import OptionalDependency + +try: + from plotly.graph_objs import Figure +except ModuleNotFoundError: + raise ModuleNotFoundError( + f"The `plotly` module was not found. Please run `pip install " + f"{OptionalDependency.FEATURE_STORE}`." 
+ ) + + +class AbsFeatureStat: + class ValidationFailedException(Exception): + def __init__(self): + pass + + def __init__(self): + self.__validate__() + + @abstractmethod + def __validate__(self): + pass + + @abstractmethod + def add_to_figure(self, fig: Figure, xaxis: int, yaxis: int): + pass + + @classmethod + @abstractmethod + def __from_json__(cls, json_dict: dict): + pass + + @staticmethod + def get_x_y_str_axes(xaxis: int, yaxis: int) -> (): + return ( + ("xaxis" + str(xaxis + 1)), + ("yaxis" + str(yaxis + 1)), + ("x" + str(xaxis + 1)), + ("y" + str(yaxis + 1)), + ) + + @classmethod + def from_json( + cls, json_dict: dict, ignore_errors: bool = False + ) -> Union["AbsFeatureStat", None]: + try: + return cls.__from_json__(json_dict=json_dict) + except Exception as e: + if ignore_errors: + return None + else: + raise e diff --git a/ads/feature_store/statistics/charts/box_plot.py b/ads/feature_store/statistics/charts/box_plot.py new file mode 100644 index 000000000..0923a8412 --- /dev/null +++ b/ads/feature_store/statistics/charts/box_plot.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from typing import List + +from ads.common.decorator.runtime_dependency import OptionalDependency +from ads.feature_store.statistics.charts.abstract_feature_stat import AbsFeatureStat +from ads.feature_store.statistics.charts.frequency_distribution import ( + FrequencyDistribution, +) +from ads.feature_store.statistics.generic_feature_value import GenericFeatureValue + +try: + from plotly.graph_objs import Figure +except ModuleNotFoundError: + raise ModuleNotFoundError( + f"The `plotly` module was not found. Please run `pip install " + f"{OptionalDependency.FEATURE_STORE}`." 
+ ) + + +class BoxPlot(AbsFeatureStat): + CONST_MIN = "Min" + CONST_MAX = "Max" + CONST_QUARTILES = "Quartiles" + CONST_SD = "StandardDeviation" + CONST_MEAN = "Mean" + CONST_BOX_PLOT_TITLE = "Box Plot" + CONST_IQR = "IQR" + CONST_BOX_POINTS = "box_points" + + class Quartiles: + CONST_Q1 = "q1" + CONST_Q2 = "q2" + CONST_Q3 = "q3" + + def __init__(self, q1: float, q2: float, q3: float): + self.q1 = q1 + self.q2 = q2 + self.q3 = q3 + + @classmethod + def from_json(cls, json_dict: dict) -> "BoxPlot.Quartiles": + return cls( + json_dict.get(cls.CONST_Q1), + json_dict.get(cls.CONST_Q2), + json_dict.get(cls.CONST_Q3), + ) + + def __init__( + self, + mean: float, + median: float, + sd: float, + q1: float, + q3: float, + box_points: List[float], + ): + self.mean = mean + self.median = median + self.q1 = q1 + self.q3 = q3 + self.sd = sd + self.iqr = self.q3 - self.q1 + self.box_points = box_points + super().__init__() + + def __validate__(self): + if ( + self.q1 is None + or self.q3 is None + or self.iqr is None + or type(self.box_points) is not list + or len(self.box_points) == 0 + ): + raise self.ValidationFailedException() + + def add_to_figure(self, fig: Figure, xaxis: int, yaxis: int): + xaxis_str, yaxis_str, x_str, y_str = self.get_x_y_str_axes(xaxis, yaxis) + fig.add_box( + notched=False, + boxmean=False, + mean=[self.mean], + median=[self.median], + q1=[self.q1], + q3=[self.q3], + sd=[self.sd], + y=[self.box_points], + upperfence=[self.q3 + 1.5 * self.iqr], + lowerfence=[self.q1 - 1.5 * self.iqr], + xaxis=x_str, + yaxis=y_str, + name="", + jitter=0, + ) + fig.layout.annotations[xaxis].text = self.CONST_BOX_PLOT_TITLE + fig.layout[yaxis_str]["title"] = "Values" + + @staticmethod + def get_box_points_from_frequency_distribution( + frequency_distribution: FrequencyDistribution, + ) -> List[float]: + # box_points = [] + if ( + frequency_distribution is not None + and frequency_distribution.frequency is not None + and frequency_distribution.bins is not None + ): + 
return [ + bin_dist + for frequency, bin_dist in zip( + frequency_distribution.frequency, frequency_distribution.bins + ) + if frequency > 0 + ] + else: + return [] + + @classmethod + def __from_json__(cls, json_dict: dict) -> "BoxPlot": + quartiles = cls.Quartiles.from_json(json_dict.get(cls.CONST_QUARTILES)) + return cls( + mean=GenericFeatureValue.from_json(json_dict.get(cls.CONST_MEAN)).val, + median=quartiles.q2, + sd=GenericFeatureValue.from_json(json_dict.get(cls.CONST_SD)).val, + q1=quartiles.q1, + q3=quartiles.q3, + box_points=json_dict.get(cls.CONST_BOX_POINTS), + ) diff --git a/ads/feature_store/statistics/charts/frequency_distribution.py b/ads/feature_store/statistics/charts/frequency_distribution.py new file mode 100644 index 000000000..697c4ff72 --- /dev/null +++ b/ads/feature_store/statistics/charts/frequency_distribution.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from typing import List +from ads.common.decorator.runtime_dependency import OptionalDependency +from ads.feature_store.statistics.charts.abstract_feature_stat import AbsFeatureStat + +try: + from plotly.graph_objs import Figure +except ModuleNotFoundError: + raise ModuleNotFoundError( + f"The `plotly` module was not found. Please run `pip install " + f"{OptionalDependency.FEATURE_STORE}`." 
+ ) + + +class FrequencyDistribution(AbsFeatureStat): + CONST_FREQUENCY = "frequency" + CONST_BINS = "bins" + CONST_FREQUENCY_DISTRIBUTION_TITLE = "Frequency Distribution" + + def __validate__(self): + if not ( + type(self.frequency) == list + and type(self.bins) == list + and 0 < len(self.frequency) == len(self.bins) > 0 + ): + raise self.ValidationFailedException() + + def __init__(self, frequency: List, bins: List): + self.frequency = frequency + self.bins = bins + super().__init__() + + @classmethod + def __from_json__(cls, json_dict: dict) -> "FrequencyDistribution": + return FrequencyDistribution( + frequency=json_dict.get(cls.CONST_FREQUENCY), + bins=json_dict.get(cls.CONST_BINS), + ) + + def add_to_figure(self, fig: Figure, xaxis: int, yaxis: int): + xaxis_str, yaxis_str, x_str, y_str = self.get_x_y_str_axes(xaxis, yaxis) + if ( + type(self.frequency) == list + and type(self.bins) == list + and 0 < len(self.frequency) == len(self.bins) > 0 + ): + fig.add_bar( + x=self.bins, y=self.frequency, xaxis=x_str, yaxis=y_str, name="" + ) + fig.layout.annotations[xaxis].text = self.CONST_FREQUENCY_DISTRIBUTION_TITLE + fig.layout[xaxis_str]["title"] = "Bins" + fig.layout[yaxis_str]["title"] = "Frequency" diff --git a/ads/feature_store/statistics/charts/probability_distribution.py b/ads/feature_store/statistics/charts/probability_distribution.py new file mode 100644 index 000000000..d64be76fa --- /dev/null +++ b/ads/feature_store/statistics/charts/probability_distribution.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from typing import List + +from ads.common.decorator.runtime_dependency import OptionalDependency +from ads.feature_store.statistics.charts.abstract_feature_stat import AbsFeatureStat + +try: + from plotly.graph_objs import Figure +except ModuleNotFoundError: + raise ModuleNotFoundError( + f"The `plotly` module was not found. Please run `pip install " + f"{OptionalDependency.FEATURE_STORE}`." + ) + + +class ProbabilityDistribution(AbsFeatureStat): + def __validate__(self): + if not ( + type(self.density) == list + and type(self.bins) == list + and 0 < len(self.density) == len(self.bins) > 0 + ): + raise self.ValidationFailedException() + + CONST_DENSITY = "density" + CONST_BINS = "bins" + CONST_PROBABILITY_DISTRIBUTION_TITLE = "Probability Distribution" + + def __init__(self, density: List, bins: List): + self.density = density + self.bins = bins + super().__init__() + + @classmethod + def __from_json__(cls, json_dict: dict) -> "ProbabilityDistribution": + return cls( + density=json_dict.get(ProbabilityDistribution.CONST_DENSITY), + bins=json_dict.get(ProbabilityDistribution.CONST_BINS), + ) + + def add_to_figure(self, fig: Figure, xaxis: int, yaxis: int): + xaxis_str, yaxis_str, x_str, y_str = self.get_x_y_str_axes(xaxis, yaxis) + if ( + type(self.density) == list + and type(self.bins) == list + and 0 < len(self.density) == len(self.bins) > 0 + ): + fig.add_bar( + x=self.bins, + y=self.density, + xaxis=x_str, + yaxis=y_str, + name="", + ) + fig.layout.annotations[xaxis].text = self.CONST_PROBABILITY_DISTRIBUTION_TITLE + fig.layout[xaxis_str]["title"] = "Bins" + fig.layout[yaxis_str]["title"] = "Density" diff --git a/ads/feature_store/statistics/charts/top_k_frequent_elements.py b/ads/feature_store/statistics/charts/top_k_frequent_elements.py new file mode 100644 index 000000000..d68840890 --- /dev/null +++ 
b/ads/feature_store/statistics/charts/top_k_frequent_elements.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from typing import List +from ads.common.decorator.runtime_dependency import OptionalDependency + +from ads.feature_store.statistics.charts.abstract_feature_stat import AbsFeatureStat + +try: + from plotly.graph_objs import Figure +except ModuleNotFoundError: + raise ModuleNotFoundError( + f"The `plotly` module was not found. Please run `pip install " + f"{OptionalDependency.FEATURE_STORE}`." + ) + + +class TopKFrequentElements(AbsFeatureStat): + def __validate__(self): + if not (type(self.elements) == list and len(self.elements) > 0): + raise self.ValidationFailedException + + CONST_VALUE = "value" + CONST_TOP_K_FREQUENT_TITLE = "Top K Frequent Elements" + + class TopKFrequentElement: + CONST_VALUE = "value" + CONST_ESTIMATE = "estimate" + CONST_LOWER_BOUND = "lower_bound" + CONST_UPPER_BOUND = "upper_bound" + + def __init__( + self, value: str, estimate: int, lower_bound: int, upper_bound: int + ): + self.value = value + self.estimate = estimate + self.lower_bound = lower_bound + self.upper_bound = upper_bound + + @classmethod + def from_json( + cls, json_dict: dict + ) -> "TopKFrequentElements.TopKFrequentElement": + return cls( + value=json_dict.get(cls.CONST_VALUE), + estimate=json_dict.get(cls.CONST_ESTIMATE), + lower_bound=json_dict.get(cls.CONST_LOWER_BOUND), + upper_bound=json_dict.get(cls.CONST_UPPER_BOUND), + ) + + def __init__(self, elements: List[TopKFrequentElement]): + self.elements = elements + super().__init__() + + @classmethod + def __from_json__(cls, json_dict: dict) -> "TopKFrequentElements": + elements = json_dict.get(cls.CONST_VALUE) + return cls([cls.TopKFrequentElement.from_json(element) for element in elements]) + + def add_to_figure(self, fig: Figure, 
xaxis: int, yaxis: int): + xaxis_str, yaxis_str, x_str, y_str = self.get_x_y_str_axes(xaxis, yaxis) + if type(self.elements) == list and len(self.elements) > 0: + y_axis = [element.value for element in self.elements] + x_axis = [element.estimate for element in self.elements] + fig.add_bar( + x=x_axis, y=y_axis, xaxis=x_str, yaxis=y_str, name="", orientation="h" + ) + fig.layout.annotations[xaxis].text = self.CONST_TOP_K_FREQUENT_TITLE + fig.layout[yaxis_str]["title"] = "Element" + fig.layout[xaxis_str]["title"] = "Count" diff --git a/ads/feature_store/statistics/feature_stat.py b/ads/feature_store/statistics/feature_stat.py new file mode 100644 index 000000000..27a7e9647 --- /dev/null +++ b/ads/feature_store/statistics/feature_stat.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from ads.common.decorator.runtime_dependency import OptionalDependency +from typing import List +from ads.feature_store.statistics.charts.abstract_feature_stat import AbsFeatureStat +from ads.feature_store.statistics.charts.box_plot import BoxPlot +from ads.feature_store.statistics.charts.frequency_distribution import ( + FrequencyDistribution, +) +from ads.feature_store.statistics.charts.probability_distribution import ( + ProbabilityDistribution, +) +from ads.feature_store.statistics.charts.top_k_frequent_elements import ( + TopKFrequentElements, +) + +try: + import plotly + from plotly.graph_objs import Figure + import plotly.graph_objects as go + from plotly.subplots import make_subplots +except ModuleNotFoundError: + raise ModuleNotFoundError( + f"The `plotly` module was not found. Please run `pip install " + f"{OptionalDependency.FEATURE_STORE}`." 
+ ) + + +class FeatureStatistics: + CONST_FREQUENCY_DISTRIBUTION = "FrequencyDistribution" + CONST_TITLE_FORMAT = "{}" + CONST_PLOT_FORMAT = "{}_plot" + CONST_PROBABILITY_DISTRIBUTION = "ProbabilityDistribution" + CONST_TOP_K_FREQUENT = "TopKFrequentElements" + + def __init__( + self, + feature_name: str, + top_k_frequent_elements: TopKFrequentElements = None, + frequency_distribution: FrequencyDistribution = None, + probability_distribution: ProbabilityDistribution = None, + box_plot: BoxPlot = None, + ): + self.feature_name: str = feature_name + self.top_k_frequent_elements = top_k_frequent_elements + self.frequency_distribution = frequency_distribution + self.probability_distribution = probability_distribution + self.box_plot = box_plot + + @classmethod + def from_json(cls, feature_name: str, json_dict: dict) -> "FeatureStatistics": + if json_dict is not None: + frequency_distribution = FrequencyDistribution.from_json( + json_dict.get(cls.CONST_FREQUENCY_DISTRIBUTION), ignore_errors=True + ) + + # inject box points for boxplot creation + json_dict[ + BoxPlot.CONST_BOX_POINTS + ] = BoxPlot.get_box_points_from_frequency_distribution( + frequency_distribution + ) + return cls( + feature_name, + TopKFrequentElements.from_json( + json_dict.get(cls.CONST_TOP_K_FREQUENT), ignore_errors=True + ), + frequency_distribution, + ProbabilityDistribution.from_json( + json_dict.get(cls.CONST_PROBABILITY_DISTRIBUTION), + ignore_errors=True, + ), + BoxPlot.from_json(json_dict, ignore_errors=True), + ) + else: + return cls(feature_name) + + @property + def __feature_stat_objects__(self) -> List[AbsFeatureStat]: + return [ + stat + for stat in [ + self.box_plot, + self.top_k_frequent_elements, + self.frequency_distribution, + self.probability_distribution, + ] + if stat is not None + ] + + def to_viz(self): + # TODO: make it generic + def next_graph_position_generator(): + yield 1 + yield 0 + yield 2 + + if len(self.__feature_stat_objects__) > 0: + fig = make_subplots(cols=3, 
column_titles=[" "] * 3) + for idx, stat in zip( + next_graph_position_generator(), + [stat for stat in self.__feature_stat_objects__ if stat is not None], + ): + stat.add_to_figure(fig, idx, idx) + + fig.layout.title = self.CONST_TITLE_FORMAT.format(self.feature_name) + fig.update_layout(title_font_size=20) + fig.update_layout(title_x=0.5) + fig.update_layout(showlegend=False) + plotly.offline.iplot( + fig, + filename=self.CONST_PLOT_FORMAT.format(self.feature_name), + ) + else: + print( + f"No statistical information for feature {self.feature_name} can be visualised" + ) diff --git a/ads/feature_store/statistics/generic_feature_value.py b/ads/feature_store/statistics/generic_feature_value.py new file mode 100644 index 000000000..6b52e3001 --- /dev/null +++ b/ads/feature_store/statistics/generic_feature_value.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- + + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +class GenericFeatureValue: + CONST_VALUE = "value" + + def __init__(self, val: any): + self.val = val + + @classmethod + def from_json(cls, json_dict: dict) -> "GenericFeatureValue": + return GenericFeatureValue( + val=json_dict.get(cls.CONST_VALUE), + ) diff --git a/ads/feature_store/statistics/statistics.py b/ads/feature_store/statistics/statistics.py new file mode 100644 index 000000000..f8b09ff0c --- /dev/null +++ b/ads/feature_store/statistics/statistics.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# -*- coding: utf-8; -*- +# Copyright (c) 2023 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +from typing import List +from ads.feature_store.statistics.feature_stat import FeatureStatistics +from ads.feature_store.response.response_builder import ResponseBuilder +import json + + +class Statistics(ResponseBuilder): + """ + Represents statistical information. + """ + + @property + def kind(self) -> str: + """ + Gets the kind of the statistics object. + + Returns + ------- + str + The kind of the statistics object, which is always "statistics". + """ + return "statistics" + + def to_viz(self, feature_list: List[str] = None): + """Visualises statistics inside notebook + Parameters + ---------- + feature_list: (str, optional). Defaults to `None`. + The specific features of the FeatureGroup or Dataset we want to visualise + """ + if self.content is not None: + [ + FeatureStatistics.from_json(feature, stat).to_viz() + for feature, stat in json.loads(self.content).items() + if (feature_list is None or feature in feature_list) + ] diff --git a/setup.py b/setup.py index 51bb24561..379f97f01 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,8 @@ "pyspark>=3.0.0", "delta-spark", "great-expectations==0.15.39", - "pyarrow" + "pyarrow", + "plotly" ], "mlm_insights": ["mlm_insights==0.1.0.dev1"], }