Fixed failing test under Windows. Added type annotations.

MindSetLib · Dec 3, 2022 · 5f62c11 · 5f62c11
1 parent 955da23
commit 5f62c11
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 35 deletions.
diff --git a/insolver/feature_monitoring/chi2_homogeneity_test.py b/insolver/feature_monitoring/chi2_homogeneity_test.py
@@ -18,7 +18,7 @@ def __init__(self, statistic: float, pvalue: float):
         self.pvalue = pvalue
 
 
-def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray):
+def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> "Chi2Result":
     """
     This function runs chi-square test checking homogeneity of two samples
     of discrete variables.

diff --git a/insolver/feature_monitoring/homogeneity_report.py b/insolver/feature_monitoring/homogeneity_report.py
@@ -1,20 +1,20 @@
+import os
+import inspect
 import numpy as np
 import pandas as pd
-from plotly.figure_factory import create_distplot
-from plotly import express as px
 import plotly as py
-import os
+import jinja2
 from os.path import dirname
-import inspect
+from typing import List, Sequence, Dict, Union
+from plotly.figure_factory import create_distplot
+from plotly import express as px
 
 from .homogeneity_tests import ContinuousHomogeneityTests, DiscreteHomogeneityTests, fillna_cont, fillna_discr
 
-import jinja2
-
 
 def chart_cont(
-    x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: iter, bins: int = 15, offline: bool = True
-):
+    x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: Sequence, bins: int = 15, offline: bool = True
+) -> py.graph_objs.Figure:
     """
     This function draws histograms of given samples using joint grid.
     It needs limits of interested area and number of bins.
@@ -71,7 +71,7 @@ def chart_cont(
         return fig
 
 
-def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True):
+def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> py.graph_objs.Figure:
     """
     This function draws histograms of given samples using joint grid.
     It needs limits of interested area and number of bins.
@@ -144,15 +144,15 @@ def __init__(self, config_dict_inp: dict):
         self.config_dict = config_dict_inp
 
     @property
-    def features(self):
-        return self.__config_dict.keys()
+    def features(self) -> List:
+        return list(self.__config_dict.keys())
 
     @property
-    def config_dict(self):
+    def config_dict(self) -> Dict:
         return self.__config_dict
 
     @config_dict.setter
-    def config_dict(self, config_dict_inp):
+    def config_dict(self, config_dict_inp: Dict) -> None:
         """
         Raises:
             ValueError: if config_dict is empty. It must have some features.
@@ -182,7 +182,7 @@ def build_report(
         name1: str = 'Base subset',
         name2: str = 'Current subset',
         draw_charts: bool = False,
-    ):
+    ) -> List:
         """
         Main function which assembles all testing logic - it takes raw dataframes
         and runs homogeneity tests for features. Feature set and properties are took from config dict.
@@ -270,7 +270,9 @@ def build_report(
                 x1, x2, _ = fillna_cont(x1, x2, inplace=True)
 
                 # run tests
-                homogen_tester = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
+                homogen_tester: Union[
+                    'ContinuousHomogeneityTests', 'DiscreteHomogeneityTests'
+                ] = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
                 test_results = homogen_tester.run_all(x1, x2, inplace=True)
 
                 # optional drawing of charts
@@ -328,7 +330,7 @@ def build_report(
         return report_data
 
 
-def render_report(report_data: list, report_path: str = 'homogeneity_report.html'):
+def render_report(report_data: list, report_path: str = 'homogeneity_report.html') -> None:
     """
     This is a separate function to render reports built by 'HomogeneityReport' class.
     Several report data lists can be concatenated and passed to this function.

diff --git a/insolver/feature_monitoring/homogeneity_tests.py b/insolver/feature_monitoring/homogeneity_tests.py
@@ -2,11 +2,12 @@
 import pandas as pd
 from scipy import stats as sps
 from sklearn.preprocessing import LabelEncoder
+from typing import Callable, List, Any
 from .chi2_homogeneity_test import chi2_discr_2samp
 from .psi_homogeneity_test import psi_discr_2samp, psi_cont_2samp, sec_min
 
 
-def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False):
+def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False) -> np.ndarray:
     """
     This function generates subsample of given size from main sample (without replaces by default).
 
@@ -24,7 +25,7 @@ def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False):
     return samp
 
 
-def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: callable):
+def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: Callable) -> float:
     """
     This function runs same test many times on subsamples of main 2 samples.
     Counted pvalues are used to get average estimate of pvalue. (Bootstrap idea).
@@ -53,7 +54,7 @@ def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int
     return pvalue
 
 
-def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
     """
     This function fills missing values in x1 and x2 safely for homogeneity tests.
     It guarantees that missing values will be filled with unique constant.
@@ -93,7 +94,7 @@ def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
         return x1, x2, 'nan'
 
 
-def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
     """
     This function fills missing values in x1 and x2 safely for homogeneity tests.
     In case when nan value is just set to some constant less than all elements
@@ -168,7 +169,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int):
         self.samp_size = samp_size
         self.bootstrap_num = bootstrap_num
 
-    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
         """
         Runs all discrete tests for two samples: 'chi2', 'psi'.
 
@@ -198,8 +199,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
         if x1_ref.dtype != x2_ref.dtype:
             raise TypeError("x1 and x2 must be of same data type.")
 
-        if x1_ref.dtype not in [int, float, object]:
-            raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.")
+        if x1_ref.dtype not in [
+            'int8',
+            'int16',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+            'uint32',
+            'uint64',
+            'float16',
+            'float32',
+            'float64',
+            'object',
+        ]:
+            raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.")
 
         if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size):
             raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.")
@@ -287,7 +301,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int, psi_b
         self.bootstrap_num = bootstrap_num
         self.psi_bins = psi_bins
 
-    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
         """
         Runs all continuous tests for two samples: 'ks', 'cr-vonmis', 'epps-sing', 'psi'.
 
@@ -317,8 +331,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
         if x1_ref.dtype != x2_ref.dtype:
             raise TypeError("x1 and x2 must be of same data type.")
 
-        if x1_ref.dtype not in [int, float, object]:
-            raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.")
+        if x1_ref.dtype not in [
+            'int8',
+            'int16',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+            'uint32',
+            'uint64',
+            'float16',
+            'float32',
+            'float64',
+            'object',
+        ]:
+            raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.")
 
         if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size):
             raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.")

diff --git a/insolver/feature_monitoring/psi_homogeneity_test.py b/insolver/feature_monitoring/psi_homogeneity_test.py
@@ -1,9 +1,10 @@
 import numpy as np
 from math import inf
 from collections import defaultdict
+from typing import Iterable, Union
 
 
-def sec_min(x):
+def sec_min(x: Iterable) -> Union[float, int]:
     """
     This function counts second minimum of an array.
 
@@ -23,7 +24,7 @@ def sec_min(x):
     return min2
 
 
-def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20):
+def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20) -> float:
     """
     This function counts population stability index (PSI)
     between two samples of continuous variables.
@@ -47,7 +48,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck
     # build grid for histograms
     min_ = min(np.min(x1), np.min(x2))
     max_ = max(np.max(x1), np.max(x2))
-    grid = []
+    grid = np.array([])
     if min_ > nan_value:
         grid = np.linspace(min_, max_, buckets + 1)
     elif min_ == nan_value:
@@ -70,7 +71,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck
     return psi_value
 
 
-def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray):
+def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> float:
     """
     This function counts psi_value between two samples of discrete variables.
 

diff --git a/tests/test_ContinuousHomogeneityTests.py b/tests/test_ContinuousHomogeneityTests.py
@@ -1,10 +1,10 @@
+import os
+import pytest
 import numpy as np
 import pandas as pd
 from scipy import stats as sps
-import pytest
 from insolver.feature_monitoring import ContinuousHomogeneityTests
 from insolver.feature_monitoring import psi_cont_2samp
-import os
 from insolver.model_tools import download_dataset
 
 

diff --git a/tests/test_DiscreteHomogeneityTests.py b/tests/test_DiscreteHomogeneityTests.py
@@ -1,9 +1,9 @@
+import os
+import pytest
 import numpy as np
 import pandas as pd
 from scipy import stats as sps
-import pytest
 from insolver.feature_monitoring import DiscreteHomogeneityTests
-import os
 from insolver.model_tools import download_dataset