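"""Bivariate analysis of the project's data set.

For every pair of response and explanatory variables the script draws a plot
and runs the matching statistical test:
  * quantitative response vs. categorical explanatory -> ANOVA, followed by a
    Tukey HSD post hoc test when the null hypothesis is rejected;
  * quantitative response vs. quantitative explanatory -> Pearson correlation;
  * categorical response vs. categorical explanatory -> chi-square, followed
    by pairwise chi-square tests with a Bonferroni adjustment.
Significant post hoc results are printed as text tables and written as CSV
files into the "results" directory.
"""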
from variables import *
from data_menegement import primary_data_management
import seaborn
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import pandas as pd
import scipy.stats
from itertools import combinations
from tabulate import tabulate
import os
from common_functions import dict_to_table

CLIMATE_REGION_CATEGORY_ORDER = ['Alaska', 'Central', 'Northeast', 'Northwest', 'South', 'Southeast', 'Southwest',
                                 'Upper_Midwest', 'West', 'West_North_Central', 'Other']


def bar_chart(e_var, e_label, r_var, r_label, data_set):
    order = None if e_var != "climate_region" else CLIMATE_REGION_CATEGORY_ORDER
    # creates bar chart with confidence intervals
    seaborn.catplot(x=r_var, y=e_var, orient='h', order=order, data=data_set, kind="bar")
    plt.xlabel(r_label)
    plt.ylabel(e_label)
    plt.show()


def scatter_plot(e_var, e_label, r_var, r_label, data_set):
    # creates a scatterplot with a fitted regression line
    seaborn.regplot(x=e_var, y=r_var, data=data_set)
    plt.xlabel(e_label)
    plt.ylabel(r_label)
    plt.show()


def chi_square(e_var, r_var, data_set):
    # performs a chi-square test of independence and also returns the
    # contingency table converted to column percentages
    tab = pd.crosstab(data_set[r_var], data_set[e_var])
    column_sum = tab.sum(axis=0)
    percentage_tab = 100 * tab / column_sum
    test_result = scipy.stats.chi2_contingency(tab)
    return test_result, percentage_tab
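# Note on the return value: scipy.stats.chi2_contingency returns the
# chi-square statistic, the p-value, the degrees of freedom and the table of
# expected frequencies, so result[0] is the statistic and result[1] the p-value.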


def analysis_of_variance(e_var, r_var, data_set):
    # performs ANOVA
    model = smf.ols(formula="%s~C(%s)" % (r_var, e_var), data=data_set)
    return model.fit()
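# For illustration (this particular pairing is assumed, not taken from the
# variable lists): with r_var="damage_property" and e_var="climate_region" the
# fitted formula is "damage_property~C(climate_region)", i.e. property damage
# explained by climate region treated as a categorical factor.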


def post_hoc_tukey(data_set, e_var, r_var):
    # performs a Tukey HSD post hoc test
    m_comparison = multi.MultiComparison(data_set[r_var], data_set[e_var])
    return m_comparison.tukeyhsd()


def comparison_dict_to_table(comparison_dict, e_var='', r_var='', method=''):
    # the post hoc test result is a dict that maps a tuple of two categories to
    # a value, present only when the pair is statistically different
    # retrieving all categories that occur in the dict
    categories = set([i for i, _ in comparison_dict] + [j for _, j in comparison_dict])
    categories = sorted(categories)
    # the counter shows which categories differ from the most other ones
    counter = {key: len(categories) - 1 for key in categories}
    counter["MAXIMUM NUMBER OF DIFFERENCES"] = len(categories) - 1
    table = [['*'] + list(categories)]  # creating the header line of the future table
    for i in categories:
        table.append([i])  # adding the row label
        for j in categories:
            if (i, j) in comparison_dict:
                value = comparison_dict[i, j]  # adding p-value or mean difference (depending on the test)
            elif (j, i) in comparison_dict:
                # adding p-value or mean difference (depending on the test)
                value = comparison_dict[j, i] if method != "ANOVA" else str(-1 * float(comparison_dict[j, i]))
            elif i == j:
                value = '*'  # marking the diagonal (same category)
            else:
                value = '_'  # marking pairs not shown to be different
                counter[i] -= 1  # subtracting one if the category is in such a pair
            table[-1].append(value)
    csv = '\n'.join([','.join(line) for line in table])  # creating text for the csv file
    if not os.path.exists("results"):  # creating the directory if it doesn't exist
        os.makedirs("results")
    with open(f"results/{method}_response_{r_var}__explanatory_{e_var}.csv", 'w') as f:
        f.write(csv)  # writing the csv into a file
    # creating the result tables to show in the console
    text_table = tabulate(table[1:], headers=table[0])
    counter = dict_to_table(counter)
    return text_table, counter
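# Worked example with hypothetical values: comparison_dict = {("A", "B"): "0.01"}
# produces the matrix
#   *  A     B
#   A  *     0.01
#   B  0.01  *
# and the counter reports one significant difference for both A and B.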


def post_hoc_chi(data_set, e_var, r_var):
    # performs pairwise chi-square tests with a Bonferroni adjustment
    # works only if the response variable has two categories
    e_values = data_set[e_var].unique()  # retrieving all explanatory categories
    pairs = list(combinations(e_values, 2))  # all pairwise combinations of categories
    comparison_dict = {}
    for e_pair in pairs:
        # keeping only the two categories of the current pair
        df = data_set[(data_set[e_var] == e_pair[0]) | (data_set[e_var] == e_pair[1])].copy()
        # performing the chi-square test again on the pair
        pair_result, _ = chi_square(e_var, r_var, df)
        if pair_result[1] < CRITICAL_VALUE / len(pairs):
            # if we reject the null hypothesis, add the pair with its p-value
            comparison_dict[e_pair] = str(pair_result[1])
    return comparison_dict
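# Example of the adjustment: with five explanatory categories there are
# C(5, 2) = 10 pairs, so each pairwise p-value is compared against
# CRITICAL_VALUE / 10 instead of CRITICAL_VALUE itself.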


def post_hoc_anova_to_dict(df):
    # transforming the data frame with the Tukey post hoc results into a comparison dictionary
    comparison_dict = {}
    for _, row in df.iterrows():
        comparison_dict[row["group1"], row["group2"]] = str(row["meandiff"])
    return comparison_dict


def post_hoc_true(tukey_result):
    # creating a data frame with the post hoc results where the null hypothesis is rejected
    # (note: _results_table is a private statsmodels attribute holding the summary rows)
    df = pd.DataFrame(data=tukey_result._results_table.data[1:],
                      columns=tukey_result._results_table.data[0])
    results = df[df["reject"] == True]  # keeping only the pairs where the difference is significant
    return post_hoc_anova_to_dict(results[["group1", "group2", "meandiff"]])


def bivariate_analysis(data_set, e_q_variables, e_cat_variables, r_q_variables, r_cat_variables):
    # performs the bivariate analysis
    for r_var in r_q_variables:  # dealing with quantitative response variables
        for e_var in e_cat_variables:  # dealing with categorical explanatory variables
            print("===================================================================================================")
            print(f"The response variable is {r_var}. The explanatory variable is {e_var}. ANOVA")
            # showing the relationship as a bar chart
            bar_chart(e_var, e_cat_variables.get(e_var, e_var), r_var, r_q_variables.get(r_var, r_var), data_set)
            print(data_set[[r_var, e_var]].groupby(e_var).mean())
            model_fit = analysis_of_variance(e_var, r_var, data_set)  # testing the model
            summary = model_fit.summary()
            print(summary)  # printing the summary
            if model_fit.f_pvalue < CRITICAL_VALUE and len(data_set[e_var].unique()) > 2:
                # if we reject the null hypothesis and there are more than two categories,
                # we need a post hoc test
                post_hoc_test = post_hoc_tukey(data_set, e_var, r_var)
                post_hoc_rejected = post_hoc_true(post_hoc_test)  # taking only the pairs where we reject H0
                text_table, counter = comparison_dict_to_table(post_hoc_rejected, e_var, r_var, "ANOVA")
                # printing the post hoc test result; the mean difference in the response
                # variable is shown only where the pair differs significantly
                print(text_table)
                print(counter)  # printing the difference counter per category
        for e_var in e_q_variables:  # dealing with quantitative explanatory variables
            print("===================================================================================================")
            print(f"The response variable is {r_var}. The explanatory variable is {e_var}. Pearson correlation")
            scatter_plot(e_var, e_q_variables.get(e_var, e_var), r_var, r_q_variables.get(r_var, r_var),
                         data_set)  # showing the relationship as a scatterplot
            print("R, p-value")
            print(scipy.stats.pearsonr(data_set[e_var], data_set[r_var]))  # performing the Pearson correlation
    if r_cat_variables and e_q_variables:  # if the response is categorical and the explanatory is quantitative,
        # the explanatory variables would have to be transformed to categorical;
        # will be added later if needed
        pass
    for r_var in r_cat_variables:  # dealing with categorical response variables
        for e_var in e_cat_variables:  # dealing with categorical explanatory variables
            print("===================================================================================================")
            print(f"The response variable is {r_var}. The explanatory variable is {e_var}. Chi-square")
            # showing the relationship as a bar chart
            bar_chart(e_var, e_cat_variables.get(e_var, e_var), r_var, r_cat_variables.get(r_var, r_var), data_set)
            result, tab = chi_square(e_var, r_var, data_set)  # performing the chi-square test
            print(tab)
            print("chi-square value, p-value")
            print(result[:2])
            if len(result) >= 2 and result[1] < CRITICAL_VALUE:  # if the p-value is lower than the critical value...
                post_hoc_test = post_hoc_chi(data_set, e_var, r_var)  # performing the post hoc test
                text_table, counter = comparison_dict_to_table(post_hoc_test, e_var, r_var, 'Chi-Square')
                # printing the post hoc test result; p-values are shown only where they are
                # lower than the Bonferroni-adjusted critical value
                print(text_table)
                print(counter)  # printing the difference counter per category


if __name__ == "__main__":
    # retrieving all variables for the research
    categorical_explanatory_variables = retrieve_cat_explanatory_vars()
    categorical_response_variables = retrieve_cat_response_variables()
    quantitative_explanatory_variables = retrieve_quantitative_explanatory_vars()
    quantitative_response_variables = retrieve_q_response_variables()
    # performing data management and adding new variables (more details in variables.py or in the report.pdf)
    data = primary_data_management(categorical_explanatory_variables, categorical_response_variables)
    bivariate_analysis(data, {}, categorical_explanatory_variables, {}, categorical_response_variables)
    # performing data management again, keeping only the records where the property damage is positive
    data = primary_data_management({**categorical_explanatory_variables, **quantitative_explanatory_variables},
                                   quantitative_response_variables, """(data_set["damage_property"] > 0)""")
    bivariate_analysis(data, quantitative_explanatory_variables, categorical_explanatory_variables,
                       quantitative_response_variables, {})