From 5f62c117178867dea04e2d7f5d757e1c4a4adfe2 Mon Sep 17 00:00:00 2001 From: alexmindset Date: Sat, 3 Dec 2022 21:21:07 +0300 Subject: [PATCH] Failing test under Windows fixed. Type annotation. --- .../chi2_homogeneity_test.py | 2 +- .../feature_monitoring/homogeneity_report.py | 34 +++++++------- .../feature_monitoring/homogeneity_tests.py | 47 +++++++++++++++---- .../psi_homogeneity_test.py | 9 ++-- tests/test_ContinuousHomogeneityTests.py | 4 +- tests/test_DiscreteHomogeneityTests.py | 4 +- 6 files changed, 65 insertions(+), 35 deletions(-) diff --git a/insolver/feature_monitoring/chi2_homogeneity_test.py b/insolver/feature_monitoring/chi2_homogeneity_test.py index d2ad84b..4ee25d4 100644 --- a/insolver/feature_monitoring/chi2_homogeneity_test.py +++ b/insolver/feature_monitoring/chi2_homogeneity_test.py @@ -18,7 +18,7 @@ def __init__(self, statistic: float, pvalue: float): self.pvalue = pvalue -def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray): +def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> "Chi2Result": """ This function runs chi-square test checking homogeneity of two samples of discrete variables. diff --git a/insolver/feature_monitoring/homogeneity_report.py b/insolver/feature_monitoring/homogeneity_report.py index a4f6f6a..41792f8 100644 --- a/insolver/feature_monitoring/homogeneity_report.py +++ b/insolver/feature_monitoring/homogeneity_report.py @@ -1,20 +1,20 @@ +import os +import inspect import numpy as np import pandas as pd -from plotly.figure_factory import create_distplot -from plotly import express as px import plotly as py -import os +import jinja2 from os.path import dirname -import inspect +from typing import List, Sequence, Dict, Union +from plotly.figure_factory import create_distplot +from plotly import express as px from .homogeneity_tests import ContinuousHomogeneityTests, DiscreteHomogeneityTests, fillna_cont, fillna_discr -import jinja2 - def chart_cont( - x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: iter, bins: int = 15, offline: bool = True -): + x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: Sequence, bins: int = 15, offline: bool = True +) -> py.graph_objs.Figure: """ This function draws histograms of given samples using joint grid. It needs limits of interested area and number of bins. @@ -71,7 +71,7 @@ def chart_cont( return fig -def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True): +def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> py.graph_objs.Figure: """ This function draws histograms of given samples using joint grid. It needs limits of interested area and number of bins. @@ -144,15 +144,15 @@ def __init__(self, config_dict_inp: dict): self.config_dict = config_dict_inp @property - def features(self): - return self.__config_dict.keys() + def features(self) -> List: + return list(self.__config_dict.keys()) @property - def config_dict(self): + def config_dict(self) -> Dict: return self.__config_dict @config_dict.setter - def config_dict(self, config_dict_inp): + def config_dict(self, config_dict_inp: Dict) -> None: """ Raises: ValueError: if config_dict is empty. It must have some features. @@ -182,7 +182,7 @@ def build_report( name1: str = 'Base subset', name2: str = 'Current subset', draw_charts: bool = False, - ): + ) -> List: """ Main function which assembles all testing logic - it takes raw dataframes and runs homogeneity tests for features. Feature set and properties are took from config dict. @@ -270,7 +270,9 @@ def build_report( x1, x2, _ = fillna_cont(x1, x2, inplace=True) # run tests - homogen_tester = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins) + homogen_tester: Union[ + 'ContinuousHomogeneityTests', 'DiscreteHomogeneityTests' + ] = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins) test_results = homogen_tester.run_all(x1, x2, inplace=True) # optional drawing of charts @@ -328,7 +330,7 @@ def build_report( return report_data -def render_report(report_data: list, report_path: str = 'homogeneity_report.html'): +def render_report(report_data: list, report_path: str = 'homogeneity_report.html') -> None: """ This is a separate function to render reports built by 'HomogeneityReport' class. Several report data lists can be concatenated and passed to this function. diff --git a/insolver/feature_monitoring/homogeneity_tests.py b/insolver/feature_monitoring/homogeneity_tests.py index 321431b..dc03d50 100644 --- a/insolver/feature_monitoring/homogeneity_tests.py +++ b/insolver/feature_monitoring/homogeneity_tests.py @@ -2,11 +2,12 @@ import pandas as pd from scipy import stats as sps from sklearn.preprocessing import LabelEncoder +from typing import Callable, List, Any from .chi2_homogeneity_test import chi2_discr_2samp from .psi_homogeneity_test import psi_discr_2samp, psi_cont_2samp, sec_min -def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False): +def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False) -> np.ndarray: """ This function generates subsample of given size from main sample (without replaces by default). @@ -24,7 +25,7 @@ def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False): return samp -def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: callable): +def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: Callable) -> float: """ This function runs same test many times on subsamples of main 2 samples. Counted pvalues are used to get average estimate of pvalue. (Bootstrap idea). @@ -53,7 +54,7 @@ def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int return pvalue -def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False): +def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any: """ This function fills missing values in x1 and x2 safely for homogeneity tests. It guarantees that missing values will be filled with unique constant. @@ -93,7 +94,7 @@ def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False): return x1, x2, 'nan' -def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False): +def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any: """ This function fills missing values in x1 and x2 safely for homogeneity tests. In case when nan value is just set to some constant less than all elements @@ -168,7 +169,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int): self.samp_size = samp_size self.bootstrap_num = bootstrap_num - def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False): + def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List: """ Runs all discrete tests for two samples: 'chi2', 'psi'. @@ -198,8 +199,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) if x1_ref.dtype != x2_ref.dtype: raise TypeError("x1 and x2 must be of same data type.") - if x1_ref.dtype not in [int, float, object]: - raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.") + if x1_ref.dtype not in [ + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + 'uint16', + 'uint32', + 'uint64', + 'float16', + 'float32', + 'float64', + 'object', + ]: + raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.") if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size): raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.") @@ -287,7 +301,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int, psi_b self.bootstrap_num = bootstrap_num self.psi_bins = psi_bins - def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False): + def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List: """ Runs all continuous tests for two samples: 'ks', 'cr-vonmis', 'epps-sing', 'psi'. @@ -317,8 +331,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) if x1_ref.dtype != x2_ref.dtype: raise TypeError("x1 and x2 must be of same data type.") - if x1_ref.dtype not in [int, float, object]: - raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.") + if x1_ref.dtype not in [ + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + 'uint16', + 'uint32', + 'uint64', + 'float16', + 'float32', + 'float64', + 'object', + ]: + raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.") if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size): raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.") diff --git a/insolver/feature_monitoring/psi_homogeneity_test.py b/insolver/feature_monitoring/psi_homogeneity_test.py index c80117f..61b7b9a 100644 --- a/insolver/feature_monitoring/psi_homogeneity_test.py +++ b/insolver/feature_monitoring/psi_homogeneity_test.py @@ -1,9 +1,10 @@ import numpy as np from math import inf from collections import defaultdict +from typing import Iterable, Union -def sec_min(x): +def sec_min(x: Iterable) -> Union[float, int]: """ This function counts second minimum of an array. @@ -23,7 +24,7 @@ def sec_min(x): return min2 -def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20): +def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20) -> float: """ This function counts population stability index (PSI) between two samples of continuous variables. @@ -47,7 +48,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck # build grid for histograms min_ = min(np.min(x1), np.min(x2)) max_ = max(np.max(x1), np.max(x2)) - grid = [] + grid = np.array([]) if min_ > nan_value: grid = np.linspace(min_, max_, buckets + 1) elif min_ == nan_value: @@ -70,7 +71,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck return psi_value -def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray): +def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> float: """ This function counts psi_value between two samples of discrete variables. diff --git a/tests/test_ContinuousHomogeneityTests.py b/tests/test_ContinuousHomogeneityTests.py index 93e3520..79b6974 100644 --- a/tests/test_ContinuousHomogeneityTests.py +++ b/tests/test_ContinuousHomogeneityTests.py @@ -1,10 +1,10 @@ +import os +import pytest import numpy as np import pandas as pd from scipy import stats as sps -import pytest from insolver.feature_monitoring import ContinuousHomogeneityTests from insolver.feature_monitoring import psi_cont_2samp -import os from insolver.model_tools import download_dataset diff --git a/tests/test_DiscreteHomogeneityTests.py b/tests/test_DiscreteHomogeneityTests.py index 8e25f52..9d6f67b 100644 --- a/tests/test_DiscreteHomogeneityTests.py +++ b/tests/test_DiscreteHomogeneityTests.py @@ -1,9 +1,9 @@ +import os +import pytest import numpy as np import pandas as pd from scipy import stats as sps -import pytest from insolver.feature_monitoring import DiscreteHomogeneityTests -import os from insolver.model_tools import download_dataset