From 5f62c117178867dea04e2d7f5d757e1c4a4adfe2 Mon Sep 17 00:00:00 2001
From: alexmindset <alex@mind-set.ru>
Date: Sat, 3 Dec 2022 21:21:07 +0300
Subject: [PATCH] Fix failing test under Windows; add type annotations.

---
 .../chi2_homogeneity_test.py                  |  2 +-
 .../feature_monitoring/homogeneity_report.py  | 34 +++++++-------
 .../feature_monitoring/homogeneity_tests.py   | 47 +++++++++++++++----
 .../psi_homogeneity_test.py                   |  9 ++--
 tests/test_ContinuousHomogeneityTests.py      |  4 +-
 tests/test_DiscreteHomogeneityTests.py        |  4 +-
 6 files changed, 65 insertions(+), 35 deletions(-)

diff --git a/insolver/feature_monitoring/chi2_homogeneity_test.py b/insolver/feature_monitoring/chi2_homogeneity_test.py
index d2ad84b..4ee25d4 100644
--- a/insolver/feature_monitoring/chi2_homogeneity_test.py
+++ b/insolver/feature_monitoring/chi2_homogeneity_test.py
@@ -18,7 +18,7 @@ def __init__(self, statistic: float, pvalue: float):
         self.pvalue = pvalue
 
 
-def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray):
+def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> "Chi2Result":
     """
     This function runs chi-square test checking homogeneity of two samples
     of discrete variables.
diff --git a/insolver/feature_monitoring/homogeneity_report.py b/insolver/feature_monitoring/homogeneity_report.py
index a4f6f6a..41792f8 100644
--- a/insolver/feature_monitoring/homogeneity_report.py
+++ b/insolver/feature_monitoring/homogeneity_report.py
@@ -1,20 +1,20 @@
+import os
+import inspect
 import numpy as np
 import pandas as pd
-from plotly.figure_factory import create_distplot
-from plotly import express as px
 import plotly as py
-import os
+import jinja2
 from os.path import dirname
-import inspect
+from typing import List, Sequence, Dict, Union
+from plotly.figure_factory import create_distplot
+from plotly import express as px
 
 from .homogeneity_tests import ContinuousHomogeneityTests, DiscreteHomogeneityTests, fillna_cont, fillna_discr
 
-import jinja2
-
 
 def chart_cont(
-    x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: iter, bins: int = 15, offline: bool = True
-):
+    x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: Sequence, bins: int = 15, offline: bool = True
+) -> py.graph_objs.Figure:
     """
     This function draws histograms of given samples using joint grid.
     It needs limits of interested area and number of bins.
@@ -71,7 +71,7 @@ def chart_cont(
         return fig
 
 
-def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True):
+def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> py.graph_objs.Figure:
     """
     This function draws histograms of given samples using joint grid.
     It needs limits of interested area and number of bins.
@@ -144,15 +144,15 @@ def __init__(self, config_dict_inp: dict):
         self.config_dict = config_dict_inp
 
     @property
-    def features(self):
-        return self.__config_dict.keys()
+    def features(self) -> List:
+        return list(self.__config_dict.keys())
 
     @property
-    def config_dict(self):
+    def config_dict(self) -> Dict:
         return self.__config_dict
 
     @config_dict.setter
-    def config_dict(self, config_dict_inp):
+    def config_dict(self, config_dict_inp: Dict) -> None:
         """
         Raises:
             ValueError: if config_dict is empty. It must have some features.
@@ -182,7 +182,7 @@ def build_report(
         name1: str = 'Base subset',
         name2: str = 'Current subset',
         draw_charts: bool = False,
-    ):
+    ) -> List:
         """
         Main function which assembles all testing logic - it takes raw dataframes
         and runs homogeneity tests for features. Feature set and properties are took from config dict.
@@ -270,7 +270,9 @@ def build_report(
                 x1, x2, _ = fillna_cont(x1, x2, inplace=True)
 
                 # run tests
-                homogen_tester = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
+                homogen_tester: Union[
+                    'ContinuousHomogeneityTests', 'DiscreteHomogeneityTests'
+                ] = ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
                 test_results = homogen_tester.run_all(x1, x2, inplace=True)
 
                 # optional drawing of charts
@@ -328,7 +330,7 @@ def build_report(
         return report_data
 
 
-def render_report(report_data: list, report_path: str = 'homogeneity_report.html'):
+def render_report(report_data: list, report_path: str = 'homogeneity_report.html') -> None:
     """
     This is a separate function to render reports built by 'HomogeneityReport' class.
     Several report data lists can be concatenated and passed to this function.
diff --git a/insolver/feature_monitoring/homogeneity_tests.py b/insolver/feature_monitoring/homogeneity_tests.py
index 321431b..dc03d50 100644
--- a/insolver/feature_monitoring/homogeneity_tests.py
+++ b/insolver/feature_monitoring/homogeneity_tests.py
@@ -2,11 +2,12 @@
 import pandas as pd
 from scipy import stats as sps
 from sklearn.preprocessing import LabelEncoder
+from typing import Callable, List, Any
 from .chi2_homogeneity_test import chi2_discr_2samp
 from .psi_homogeneity_test import psi_discr_2samp, psi_cont_2samp, sec_min
 
 
-def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False):
+def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False) -> np.ndarray:
     """
     This function generates subsample of given size from main sample (without replaces by default).
 
@@ -24,7 +25,7 @@ def gen_sample(x: np.ndarray, samp_size: int, replace: bool = False):
     return samp
 
 
-def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: callable):
+def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int, test: Callable) -> float:
     """
     This function runs same test many times on subsamples of main 2 samples.
     Counted pvalues are used to get average estimate of pvalue. (Bootstrap idea).
@@ -53,7 +54,7 @@ def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int
     return pvalue
 
 
-def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
     """
     This function fills missing values in x1 and x2 safely for homogeneity tests.
     It guarantees that missing values will be filled with unique constant.
@@ -93,7 +94,7 @@ def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
         return x1, x2, 'nan'
 
 
-def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
     """
     This function fills missing values in x1 and x2 safely for homogeneity tests.
     In case when nan value is just set to some constant less than all elements
@@ -168,7 +169,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int):
         self.samp_size = samp_size
         self.bootstrap_num = bootstrap_num
 
-    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
         """
         Runs all discrete tests for two samples: 'chi2', 'psi'.
 
@@ -198,8 +199,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
         if x1_ref.dtype != x2_ref.dtype:
             raise TypeError("x1 and x2 must be of same data type.")
 
-        if x1_ref.dtype not in [int, float, object]:
-            raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.")
+        if x1_ref.dtype not in [
+            'int8',
+            'int16',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+            'uint32',
+            'uint64',
+            'float16',
+            'float32',
+            'float64',
+            'object',
+        ]:
+            raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.")
 
         if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size):
             raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.")
@@ -287,7 +301,7 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int, psi_b
         self.bootstrap_num = bootstrap_num
         self.psi_bins = psi_bins
 
-    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False):
+    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
         """
         Runs all continuous tests for two samples: 'ks', 'cr-vonmis', 'epps-sing', 'psi'.
 
@@ -317,8 +331,21 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
         if x1_ref.dtype != x2_ref.dtype:
             raise TypeError("x1 and x2 must be of same data type.")
 
-        if x1_ref.dtype not in [int, float, object]:
-            raise TypeError("Only int, float or object datatypes are supported as x1/x2.dtype.")
+        if x1_ref.dtype not in [
+            'int8',
+            'int16',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+            'uint32',
+            'uint64',
+            'float16',
+            'float32',
+            'float64',
+            'object',
+        ]:
+            raise TypeError(f"Only int, float or object datatypes are supported as x1/x2.dtype. Got {x1_ref.dtype}.")
 
         if (x1_ref.shape[0] < self.samp_size) or (x2_ref.shape[0] < self.samp_size):
             raise ValueError("Sizes of x1 and x2 must be not less than 'samp_size' attribute.")
diff --git a/insolver/feature_monitoring/psi_homogeneity_test.py b/insolver/feature_monitoring/psi_homogeneity_test.py
index c80117f..61b7b9a 100644
--- a/insolver/feature_monitoring/psi_homogeneity_test.py
+++ b/insolver/feature_monitoring/psi_homogeneity_test.py
@@ -1,9 +1,10 @@
 import numpy as np
 from math import inf
 from collections import defaultdict
+from typing import Iterable, Union
 
 
-def sec_min(x):
+def sec_min(x: Iterable) -> Union[float, int]:
     """
     This function counts second minimum of an array.
 
@@ -23,7 +24,7 @@ def sec_min(x):
     return min2
 
 
-def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20):
+def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buckets: int = 20) -> float:
     """
     This function counts population stability index (PSI)
     between two samples of continuous variables.
@@ -47,7 +48,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck
     # build grid for histograms
     min_ = min(np.min(x1), np.min(x2))
     max_ = max(np.max(x1), np.max(x2))
-    grid = []
+    grid = np.array([])
     if min_ > nan_value:
         grid = np.linspace(min_, max_, buckets + 1)
     elif min_ == nan_value:
@@ -70,7 +71,7 @@ def psi_cont_2samp(x1: np.ndarray, x2: np.ndarray, nan_value: float = -1.0, buck
     return psi_value
 
 
-def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray):
+def psi_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> float:
     """
     This function counts psi_value between two samples of discrete variables.
 
diff --git a/tests/test_ContinuousHomogeneityTests.py b/tests/test_ContinuousHomogeneityTests.py
index 93e3520..79b6974 100644
--- a/tests/test_ContinuousHomogeneityTests.py
+++ b/tests/test_ContinuousHomogeneityTests.py
@@ -1,10 +1,10 @@
+import os
+import pytest
 import numpy as np
 import pandas as pd
 from scipy import stats as sps
-import pytest
 from insolver.feature_monitoring import ContinuousHomogeneityTests
 from insolver.feature_monitoring import psi_cont_2samp
-import os
 from insolver.model_tools import download_dataset
 
 
diff --git a/tests/test_DiscreteHomogeneityTests.py b/tests/test_DiscreteHomogeneityTests.py
index 8e25f52..9d6f67b 100644
--- a/tests/test_DiscreteHomogeneityTests.py
+++ b/tests/test_DiscreteHomogeneityTests.py
@@ -1,9 +1,9 @@
+import os
+import pytest
 import numpy as np
 import pandas as pd
 from scipy import stats as sps
-import pytest
 from insolver.feature_monitoring import DiscreteHomogeneityTests
-import os
 from insolver.model_tools import download_dataset