-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #187 from MindSetLib/feat-monit
Add homogeneity tests for feature monitoring and unittests for this code
- Loading branch information
Showing
9 changed files
with
690,798 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from .homogeneity_tests import ContinuousHomogeneityTests | ||
from .homogeneity_tests import DiscreteHomogeneityTests | ||
from .homogeneity_tests import fillna_cont | ||
from .homogeneity_tests import fillna_discr | ||
|
||
from .psi_homogeneity_test import psi_discr_2samp | ||
from .psi_homogeneity_test import psi_cont_2samp | ||
from .psi_homogeneity_test import sec_min | ||
|
||
from .chi2_homogeneity_test import chi2_discr_2samp | ||
|
||
from .homogeneity_report import chart_cont | ||
from .homogeneity_report import chart_discr | ||
from .homogeneity_report import HomogeneityReport | ||
from .homogeneity_report import render_report |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import numpy as np | ||
from scipy import stats as sps | ||
from collections import defaultdict | ||
|
||
|
||
class Chi2Result:
    """
    Result of a chi-square test, returned in scipy style
    (a structure with two named fields).

    Parameters:
        statistic (float): value of the computed chi-square statistic.
        pvalue (float): p-value corresponding to this statistic.
    """

    def __init__(self, statistic: float, pvalue: float):
        self.statistic = statistic
        self.pvalue = pvalue

    def __repr__(self) -> str:
        # A readable repr makes printed/logged test results self-describing
        # instead of showing only the object's memory address.
        return (
            f"{type(self).__name__}"
            f"(statistic={self.statistic!r}, pvalue={self.pvalue!r})"
        )
|
||
|
||
def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> "Chi2Result":
    """
    Run the chi-square test checking homogeneity of two samples
    of a discrete variable.

    The statistic is computed as

        n1 * n2 * sum_i (mu_i / n1 - nu_i / n2) ** 2 / (mu_i + nu_i)

    where the sum runs over every category found in either sample,
    mu_i and nu_i are the counts of category i in x1 and x2, and n1, n2
    are the sample sizes. The p-value is the upper tail of the chi-square
    distribution with (number of categories - 1) degrees of freedom.

    Parameters:
        x1 (np.array): sample from base period.
        x2 (np.array): sample from current period.
    Returns:
        res (Chi2Result): object containing the computed statistic and
            the corresponding pvalue. If only one category is present
            across both samples, the result is statistic 0.0, pvalue 1.0.
    """
    n1, n2 = len(x1), len(x2)

    # find unique categories and their frequencies in both arrays;
    # defaultdict(int) yields a count of 0 for a category absent from a sample
    cats1, freqs1 = np.unique(x1, return_counts=True)
    counts1 = defaultdict(int, zip(cats1, freqs1))

    cats2, freqs2 = np.unique(x2, return_counts=True)
    counts2 = defaultdict(int, zip(cats2, freqs2))

    cats = np.union1d(cats1, cats2)
    num_cats = len(cats)

    # if both samples consist of only one constant value
    # we consider the statistic to be zero
    if num_cats == 1:
        return Chi2Result(0.0, 1.0)

    # calculate statistic
    chi2 = 0.0
    for cat in cats:
        mu_i = counts1[cat]
        nu_i = counts2[cat]
        chi2 += ((mu_i / n1 - nu_i / n2) ** 2) / (mu_i + nu_i)
    chi2 *= n1 * n2

    # Use the survival function rather than `1 - cdf`: for large statistics
    # the cdf rounds to 1.0 and the subtraction collapses the p-value to
    # exactly 0.0, while sf keeps the small tail probability.
    pvalue = sps.chi2.sf(chi2, num_cats - 1)
    return Chi2Result(chi2, pvalue)
Oops, something went wrong.