Skip to content

Commit

Permalink
Failing test under Windows fixed. Type annotation.
Browse files Browse the repository at this point in the history
  • Loading branch information
alexmindset committed Dec 3, 2022
1 parent 955da23 commit 5f62c11
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 35 deletions.
2 changes: 1 addition & 1 deletion insolver/feature_monitoring/chi2_homogeneity_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self, statistic: float, pvalue: float):
self.pvalue = pvalue


def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray):
def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> "Chi2Result":
"""
This function runs chi-square test checking homogeneity of two samples
of discrete variables.
Expand Down
34 changes: 18 additions & 16 deletions insolver/feature_monitoring/homogeneity_report.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import os
import inspect
import numpy as np
import pandas as pd
from plotly.figure_factory import create_distplot
from plotly import express as px
import plotly as py
import os
import jinja2
from os.path import dirname
import inspect
from typing import List, Sequence, Dict, Union
from plotly.figure_factory import create_distplot
from plotly import express as px

from .homogeneity_tests import ContinuousHomogeneityTests, DiscreteHomogeneityTests, fillna_cont, fillna_discr

import jinja2


def chart_cont(
x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: iter, bins: int = 15, offline: bool = True
):
x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: Sequence, bins: int = 15, offline: bool = True
) -> py.graph_objs.Figure:
"""
This function draws histograms of given samples using joint grid.
It needs the limits of the area of interest and the number of bins.
Expand Down Expand Up @@ -71,7 +71,7 @@ def chart_cont(
return fig


def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True):
def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> py.graph_objs.Figure:
"""
This function draws histograms of given samples using joint grid.
It needs limits of interested area and number of bins.
Expand Down Expand Up @@ -144,15 +144,15 @@ def __init__(self, config_dict_inp: dict):
self.config_dict = config_dict_inp

@property
def features(self):
return self.__config_dict.keys()
def features(self) -> List:
return list(self.__config_dict.keys())

@property
def config_dict(self):
def config_dict(self) -> Dict:
return self.__config_dict

@config_dict.setter
def config_dict(self, config_dict_inp):
def config_dict(self, config_dict_inp: Dict) -> None:
"""
Raises:
ValueError: if config_dict is empty. It must have some features.
Expand Down Expand Up @@ -182,7 +182,7 @@ def build_report(
name1: str = 'Base subset',
name2: str = 'Current subset',
draw_charts: bool = False,
):
) -> List:
"""
Main function which assembles all testing logic - it takes raw dataframes
and runs homogeneity tests for features. The feature set and properties are taken from the config dict.
Expand Down Expand Up @@ -270,7 +270,9 @@ def build_report(
x1, x2, _ = fillna_cont(x1, x2, inplace=True)

# run tests
homogen_tester = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
homogen_tester: Union[
'ContinuousHomogeneityTests', 'DiscreteHomogeneityTests'
] = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
test_results = homogen_tester.run_all(x1, x2, inplace=True)

# optional drawing of charts
Expand Down Expand Up @@ -328,7 +330,7 @@ def build_report(
return report_data


def render_report(report_data: list, report_path: str = 'homogeneity_report.html'):
def render_report(report_data: list, report_path: str = 'homogeneity_report.html') -> None:
"""
This is a separate function to render reports built by 'HomogeneityReport' class.
Several report data lists can be concatenated and passed to this function.
Expand Down
47 changes: 37 additions & 10 deletions insolver/feature_monitoring/homogeneity_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
import pandas as pd
from scipy import stats as sps
from sklearn.preprocessing import LabelEncoder
from typing import Callable, List, Any
from .chi2_homogeneity_test import chi2_discr_2samp
from .psi_homogeneity_test import psi_discr_2samp, psi_cont_2samp, sec_min


def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False):
def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False) -> np.ndarray:
"""
This function generates a subsample of the given size from the main sample (without replacement by default).
Expand All @@ -24,7 +25,7 @@ def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False):
return samp


def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: callable):
def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: Callable) -> float:
"""
This function runs the same test many times on subsamples of the two main samples.
The computed p-values are used to get an average estimate of the p-value (bootstrap idea).
Expand Down Expand Up @@ -53,7 +54,7 @@ def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int
return pvalue


def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
"""
This function fills missing values in x1 and x2 safely for homogeneity tests.
It guarantees that missing values will be filled with unique constant.
Expand Down Expand Up @@ -93,7 +94,7 @@ def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
return x1, x2, 'nan'


def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
"""
This function fills missing values in x1 and x2 safely for homogeneity tests.
In case when nan value is just set to some constant less than all elements
Expand Down Expand Up @@ -168,7 +169,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int):
self.samp_size = samp_size
self.bootstrap_num = bootstrap_num

def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
"""
Runs all discrete tests for two samples: 'chi2', 'psi'.
Expand Down Expand Up @@ -198,8 +199,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
if x1_ref.dtype != x2_ref.dtype:
raise TypeError("x1 and x2 must be of same data type.")

if x1_ref.dtype not in [int, float, object]:
raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.")
if x1_ref.dtype not in [
'int8',
'int16',
'int32',
'int64',
'uint8',
'uint16',
'uint32',
'uint64',
'float16',
'float32',
'float64',
'object',
]:
raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.")

if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size):
raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.")
Expand Down Expand Up @@ -287,7 +301,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int, psi_b
self.bootstrap_num = bootstrap_num
self.psi_bins = psi_bins

def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
"""
Runs all continuous tests for two samples: 'ks', 'cr-vonmis', 'epps-sing', 'psi'.
Expand Down Expand Up @@ -317,8 +331,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
if x1_ref.dtype != x2_ref.dtype:
raise TypeError("x1 and x2 must be of same data type.")

if x1_ref.dtype not in [int, float, object]:
raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.")
if x1_ref.dtype not in [
'int8',
'int16',
'int32',
'int64',
'uint8',
'uint16',
'uint32',
'uint64',
'float16',
'float32',
'float64',
'object',
]:
raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.")

if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size):
raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.")
Expand Down
9 changes: 5 additions & 4 deletions insolver/feature_monitoring/psi_homogeneity_test.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import numpy as np
from math import inf
from collections import defaultdict
from typing import Iterable, Union


def sec_min(x):
def sec_min(x: Iterable) -> Union[float, int]:
"""
This function computes the second minimum of an array.
Expand All @@ -23,7 +24,7 @@ def sec_min(x):
return min2


def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20):
def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20) -> float:
"""
This function computes the population stability index (PSI)
between two samples of continuous variables.
Expand All @@ -47,7 +48,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck
# build grid for histograms
min_ = min(np.min(x1), np.min(x2))
max_ = max(np.max(x1), np.max(x2))
grid = []
grid = np.array([])
if min_ > nan_value:
grid = np.linspace(min_, max_, buckets + 1)
elif min_ == nan_value:
Expand All @@ -70,7 +71,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck
return psi_value


def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray):
def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> float:
"""
This function computes the PSI value between two samples of discrete variables.
Expand Down
4 changes: 2 additions & 2 deletions tests/test_ContinuousHomogeneityTests.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os
import pytest
import numpy as np
import pandas as pd
from scipy import stats as sps
import pytest
from insolver.feature_monitoring import ContinuousHomogeneityTests
from insolver.feature_monitoring import psi_cont_2samp
import os
from insolver.model_tools import download_dataset


Expand Down
4 changes: 2 additions & 2 deletions tests/test_DiscreteHomogeneityTests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
import pytest
import numpy as np
import pandas as pd
from scipy import stats as sps
import pytest
from insolver.feature_monitoring import DiscreteHomogeneityTests
import os
from insolver.model_tools import download_dataset


Expand Down

0 comments on commit 5f62c11

Please sign in to comment.