ml_functions.py
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 7 19:25:23 2021
@author: Jeswin
"""
from collections import Counter
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
sns.set(style="ticks", rc={'figure.figsize':(7,6)})
sns.set_context(rc = {"font.size":15, "axes.labelsize":15}, font_scale=2)
sns.set_palette('colorblind');
from pandas.api.types import CategoricalDtype
# pandas defaults
np.set_printoptions(precision=4)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
import warnings
warnings.filterwarnings('ignore')
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax, chi2_contingency, chi2, entropy
# ---------------------------------------------------------------------------------------------------------------------------------
def phi_coefficient(a, b):
    """
    Computes the phi coefficient between two binary nominal variables.
    It is analogous to Pearson correlation for continuous variables.
    Parameters:
        a, b - binary nominal categorical/object pandas Series
    Returns:
        float - phi coefficient, rounded to 4 decimal places
    """
    temp = pd.crosstab(a, b)
    # numerator: ad - bc from the 2x2 contingency table
    nr = (temp.iloc[1, 1] * temp.iloc[0, 0]) - (temp.iloc[0, 1] * temp.iloc[1, 0])
    # products of the marginal totals of the contingency table
    rs = np.prod(temp.apply(sum, axis='index').to_list(), dtype=np.int64)
    cs = np.prod(temp.apply(sum, axis='columns').to_list(), dtype=np.int64)
    return round(nr / np.sqrt(rs * cs), 4)
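# Usage sketch (hypothetical names: `df` is a DataFrame with binary columns 'sex' and 'survived'):
#     phi_coefficient(df['sex'], df['survived'])
#     # -> float in [-1, 1]; magnitude close to 1 indicates a strong association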
# ---------------------------------------------------------------------------------------------------------------------------------
def cramers_v(a, b):
    """
    Computes the bias-corrected Cramer's V between two nominal variables.
    Like the phi coefficient, it measures the strength of association, but it
    also works for contingency tables larger than 2x2.
    Parameters:
        a, b - nominal categorical/object pandas Series
    Returns:
        float - Cramer's V in [0, 1]
    """
    crosstab = pd.crosstab(a, b)
    chi2v = chi2_contingency(crosstab)[0]  # chi-squared statistic
    n = crosstab.sum().sum()
    phi2 = chi2v / n
    r, k = crosstab.shape
    # bias correction of phi^2 and of the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
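# Usage sketch (hypothetical `df` with nominal columns 'embarked' and 'pclass', not necessarily binary):
#     cramers_v(df['embarked'], df['pclass'])
#     # -> float in [0, 1]; unlike phi_coefficient, handles tables larger than 2x2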
# ---------------------------------------------------------------------------------------------------------------------------------
def generate_heatmap(df):
    """
    Computes the Spearman correlation matrix of a dataframe and plots it as a
    heatmap with the upper triangle masked (to avoid duplicate cells).
    Parameters:
        df - pandas dataframe with numerical columns
    Returns:
        None (draws the heatmap)
    """
    corr = df.corr(method="spearman")
    # Generate a mask for the upper triangle (np.bool is removed in newer NumPy; use the builtin bool)
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(15, 9))
    # Draw the heatmap with the mask
    sns.heatmap(corr, mask=mask, cmap='coolwarm', fmt='.2f', linewidths=.5, annot=True)
    plt.title("Correlation heatmap")
    return
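# Usage sketch (hypothetical `df`; only numeric columns contribute to the correlation matrix):
#     generate_heatmap(df.select_dtypes(include=np.number))
#     plt.show()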
# ---------------------------------------------------------------------------------------------------------------------------------
def conditional_entropy(x, y):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    :param x: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :return: float
    """
    # entropy of x given y: sum over (x, y) pairs of p(x,y) * log(p(y) / p(x,y))
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    cond_entropy = 0.0  # renamed locally to avoid shadowing scipy.stats.entropy
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        cond_entropy += p_xy * math.log(p_y / p_xy)
    return cond_entropy
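# Usage sketch (hypothetical lists): S(x|y) is 0 when y fully determines x.
#     conditional_entropy(['a', 'a', 'b', 'b'], [0, 0, 1, 1])   # -> 0.0
#     conditional_entropy(['a', 'b', 'a', 'b'], [0, 0, 1, 1])   # -> log(2) ~ 0.693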
# ---------------------------------------------------------------------------------------------------------------------------------
def theils_u(x, y):
    """
    Calculates Theil's U (uncertainty coefficient) of x given y: U(x|y).
    An asymmetric measure of association between two nominal variables,
    in [0, 1]: 0 means y tells us nothing about x, 1 means y fully determines x.
    :param x: list / NumPy ndarray / Pandas Series
    :param y: list / NumPy ndarray / Pandas Series
    :return: float
    """
    s_xy = conditional_entropy(x, y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n / total_occurrences, x_counter.values()))
    s_x = entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x
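# Usage sketch (hypothetical lists): Theil's U is asymmetric, so in general
# theils_u(x, y) != theils_u(y, x).
#     theils_u(['a', 'a', 'b', 'b'], [0, 1, 2, 3])   # -> 1.0, y fully determines x
#     theils_u([0, 1, 2, 3], ['a', 'a', 'b', 'b'])   # -> 0.5, y only partly explains x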
# ---------------------------------------------------------------------------------------------------------------------------------
def boxplot_and_histogram_plot(ser):
    """
    Splits the figure into two stacked axes that share the x-axis: a boxplot
    on top and a histogram below, for a numerical series.
    Parameters:
        ser - pandas series with numerical values
    Returns:
        None (draws the boxplot and histogram)
    """
    fig, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
    # Add a graph in each part (histplot replaces the deprecated distplot)
    sns.boxplot(x=ser, ax=ax_box)
    sns.histplot(ser, kde=True, ax=ax_hist)
    # Remove x axis name for the boxplot
    ax_box.set(xlabel='')
    sns.despine(ax=ax_hist)
    sns.despine(ax=ax_box, left=True)
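# Usage sketch (hypothetical `df` with a numeric 'fare' column):
#     boxplot_and_histogram_plot(df['fare'])
#     plt.show()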
# ---------------------------------------------------------------------------------------------------------------------------------
def detect_outliers(df, n, features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    Courtesy: https://www.kaggle.com/abisheksudarshan/titanic-top-10-solution
    Parameters:
        df - pandas dataframe
        n - number of occurrences of outliers allowed in a single row
        features - column names
    Returns:
        list of indices having more than n outliers
    """
    outlier_indices = []
    # iterate over features (columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col], 75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        # outlier step
        outlier_step = 1.5 * IQR
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        # append the found outlier indices for col to the list of outlier indices
        outlier_indices.extend(outlier_list_col)
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    return multiple_outliers
# ---------------------------------------------------------------------------------------------------------------------------------
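# A minimal, self-contained demo (a hedged sketch with synthetic data, not part of the
# original module) that exercises detect_outliers and cramers_v when the file is run directly.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        'age': np.append(rng.normal(40, 5, 98), [120, 130]),    # two obvious outliers
        'fare': np.append(rng.normal(30, 10, 98), [500, 600]),  # two obvious outliers
        'sex': rng.choice(['male', 'female'], size=100),
        'embarked': rng.choice(['S', 'C', 'Q'], size=100),
    })
    # rows with more than 1 Tukey outlier across the numeric columns
    print("outlier rows:", detect_outliers(demo, 1, ['age', 'fare']))
    # association between two nominal columns (expected to be near 0 for random data)
    print("cramers_v:", cramers_v(demo['sex'], demo['embarked']))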